Example #1
    def Cold_boot(self, url, pause=3):
        """
        Fetch the URL and return the decoded body; retry after a delay if an error occurs.
        """
        headers = {'user-agent': self.get_random_user_agent()}
        try:
            # suppress the warning triggered by verify=False
            requests.packages.urllib3.disable_warnings(
                requests.packages.urllib3.exceptions.InsecureRequestWarning)
            r = requests.get(url=url,
                             proxies=self.proxies,
                             headers=headers,
                             allow_redirects=False,
                             verify=False,
                             timeout=30)
            time.sleep(pause)
            ylog.info(url)
            content = r.content
            # detect the response encoding before decoding
            charset = cchardet.detect(content)
            bsObj = content.decode(charset['encoding'])
            return bsObj
        except Exception as e:
            print(e)
            print("Sleeping for %i" % self.error_delay)
            time.sleep(self.error_delay)
            return self.Cold_boot(url, pause)
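To make the snippet above self-contained, here is a minimal sketch of a host class it could live in; the class name, proxies value, and user-agent list are illustrative assumptions, and Cold_boot additionally needs requests, cchardet, time, and ylog imported at module level.

import random


class GoogleScraper:
    """Hypothetical host class exposing only what Cold_boot relies on."""

    def __init__(self, proxies=None, error_delay=5):
        self.proxies = proxies          # e.g. {'https': 'http://127.0.0.1:8118'}
        self.error_delay = error_delay  # seconds to sleep before a retry

    def get_random_user_agent(self):
        # stand-in for a real user-agent rotation helper
        agents = ['Mozilla/5.0 (X11; Linux x86_64)',
                  'Mozilla/5.0 (Windows NT 10.0; Win64; x64)']
        return random.choice(agents)

    # the Cold_boot method from Example #1 would be attached here, e.g.:
    # html = GoogleScraper().Cold_boot('https://www.google.com/search?q=python')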
Example #2
    def extract(self, text):
        """Parse a Google results page and return a list of result dicts."""
        soup = BeautifulSoup(text, "lxml")
        papers = soup.find_all('div', class_='g')
        result = []
        for item in papers:
            try:
                pub, created_datetime = item.find(
                    'div', class_='slp').find('span').get_text().split('-')
                created_datetime = self.clear_time(created_datetime.strip())
            except ValueError:
                # no date part after the publication name
                ylog.info(item.find('div', class_='slp').find_all('span'))
                pub = item.find('div', class_='slp').find('span').get_text()
                created_datetime = None
            except Exception:
                # malformed result block, skip it
                continue
            Title = BeautifulSoup(str(item.find('h3')), "lxml").get_text()
            PageURL = item.find('a')['href']
            MatchedAbstract = item.find('div', class_='st').get_text()
            information = {
                'Title': Title,
                'PageURL': PageURL,
                'Publication': pub.replace('\u200e ', ''),
                'MatchedAbstract': MatchedAbstract,
                'CreatedTime': created_datetime
            }

            result.append(information)

        return result
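For reference, a reconstructed fragment of the markup that extract's selectors expect (div.g, div.slp span, h3, a, div.st); the HTML below is illustrative, not captured Google output.

sample_html = """
<div class="g">
  <h3>A sample result title</h3>
  <a href="https://example.com/page">link</a>
  <div class="slp"><span>Example News - 3 hours ago</span></div>
  <div class="st">Matched abstract text shown under the result.</div>
</div>
"""
# extract(sample_html) would then return roughly:
# [{'Title': 'A sample result title',
#   'PageURL': 'https://example.com/page',
#   'Publication': 'Example News ',
#   'MatchedAbstract': 'Matched abstract text shown under the result.',
#   'CreatedTime': <whatever self.clear_time makes of '3 hours ago'>}]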
Example #3
def find_meta(title, doi):
    """ find Crossref metadata by title (or DOI)
    Keyword Arguments:
    title -- article title to match against Crossref results
    doi   -- DOI string (not used in this snippet)
    """
    ylog.info(title)
    works = Works()
    w1 = works.query(title).sort('relevance').order('desc')
    i = 0
    for item in w1:
        i = i + 1
        try:
            t = item.get('title')[0]
            sub_title = item.get('subtitle')[0]
        except Exception:
            # record lacks a title or subtitle
            continue
        if SequenceMatcher(a=title, b=t).ratio() > 0.9 or SequenceMatcher(
                a=title, b=sub_title).ratio() > 0.9:
            return item
        if i > 18:
            # give up after checking the first 18 candidates
            ylog.debug('[x]%s' % title)
            return None
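The imports this snippet depends on, assuming the community crossrefapi client for Works and difflib's SequenceMatcher, plus a commented illustrative call:

from difflib import SequenceMatcher

from crossref.restful import Works
from ylib import ylog

# usage, once find_meta is defined:
# item = find_meta("An illustrative paper title", doi=None)
# print(item['DOI'] if item else 'no close match')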
Example #4
    def find_meta(self, identifier):
        """ find metadata with title or DOI
        Keyword Arguments:
        identifier -- dict with at least 'article_link' and 'name'
        """
        try:
            # verify=False is dangerous but sci-hub.io
            # requires intermediate certificates to verify
            # and requests doesn't know how to download them.
            # as a hacky fix, you can add them to your store
            # and verifying would work. will fix this later.
            url = self.base_url + identifier['article_link']
            self.sess.headers = {'user-agent': self.get_random_user_agent()}
            res = self.sess.get(url, verify=False, allow_redirects=False)
            # strip a leading "[...] " tag from the article name
            re_bracket = re.compile(r"\[(.*?)\]\s")
            title = re.sub(re_bracket, "", identifier['name'])
            ylog.debug('*' * 80)
            ylog.debug("title: %s" % title)
            ylog.debug(res.status_code)
            ylog.debug("headers: %s" % res.headers['Content-Type'])
            ylog.debug('location: %s' % res.headers.get("Location"))
            search_title = True
            if not res.headers.get("Location"):
                # no redirect: the page body should contain the DOI
                content = res.content
                if len(content) > 2:
                    import cchardet
                    charset = cchardet.detect(content)
                    text = content.decode(charset['encoding'])
                    soup = BeautifulSoup(text, "lxml")
                    script = soup.script.get_text()
                    doi_regexp = r'10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+'
                    try:
                        doi_match = re.compile(doi_regexp).findall(script)[0]
                        ylog.info("DOI: %s" % doi_match)
                        search_title = False
                        # use crossref API to get metadata
                        works = Works()
                        w1 = works.query(doi_match).sort('relevance').order(
                            'desc')
                        for item in w1:
                            # TODO: verify title
                            # take the most relevant hit
                            return {'meta': item['DOI'], 'url': url}
                    except IndexError:
                        ylog.debug('failed to find regexp')
            elif search_title:
                # redirected: fall back to querying Crossref by title
                works = Works()
                w1 = works.query(title).sort('relevance').order('desc')
                i = 0
                for item in w1:
                    i = i + 1
                    try:
                        t = item.get('title')[0]
                    except TypeError:
                        # record has no title; skip it
                        continue
                    try:
                        sub_title = item.get('subtitle')[0]
                    except TypeError:
                        sub_title = ''
                    if SequenceMatcher(
                            a=title, b=t).ratio() > 0.9 or SequenceMatcher(
                                a=title, b=sub_title).ratio(
                                ) > 0.9 or t.startswith(title):
                        ylog.debug("DOI %s" % item['DOI'])
                        return {'meta': item['DOI'], 'url': url}
                    if i > 18:
                        # give up after the first 18 candidates
                        return None

        except requests.exceptions.ConnectionError:
            logger.info('{} cannot access, changing'.format(
                self.available_base_url_list[0]))
            self._change_base_url()

        except requests.exceptions.RequestException:
            return {
                'err':
                'Failed to fetch pdf with identifier %s (resolved url %s) due to request exception.'
                % (identifier, url)
            }
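A small self-contained check of the DOI regular expression used above; the page fragment is made up, and the DOI is the one queried later in this listing.

import re

doi_regexp = r'10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+'

# made-up fragment resembling the inline <script> a resolver page might embed
script_text = "location.href = '/downloads/file.pdf?doi=10.1145/2449396.2449413';"

print(re.compile(doi_regexp).findall(script_text))
# expected: ['10.1145/2449396.2449413']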
Example #5
def upload_node(dict_re_match_object):
    """ Upload the regular-expression match objects in the dictionary as one batch.
    1. get each value from the input dictionary.
    2. create a graph upload request.
    3. fill node properties;
       use the encoded original Chinese title plus url as the url property.
    4. if the upload response reports an error, retry.
    5. print upload statistics.
    Keyword Arguments:
    dict_re_match_object -- dict mapping index -> re match object (or None)
    """
    res = None
    error = None
    re_upload_error = None
    retry = 0
    nodes_fail_retry = 0
    uploaded_number = 0
    while res is None:
        # clear per-attempt error state so a successful retry can exit the loop
        error = None
        re_upload_error = None
        try:
            graph_upload_request = graphUpload_pb2.GraphUploadRequest()
            # iterate over the batch of nodes
            for index, value in dict_re_match_object.items():
                if value is not None:
                    # strip the surrounding brackets from the matched title
                    title = value.group()[1:-1]
                    zh_title = HanziConv.toSimplified(title)
                    node = graph_upload_request.graph.nodes.add()
                    node.props.type = "readonlyDoc"
                    p2 = node.props.props.entries.add()
                    p2.key = "_s_import_source"
                    p2.value = "word2vec model"

                    node.businessID.url = "https://www.google.com.hk/search?hl=en&source=hp&q=" + quote_plus(
                        title)
                    node.names.chinese = zh_title

            # other information of the upload request
            graph_upload_request.uploadTag = "UploadWord2VecVocabNodes"
            graph_upload_request.nodeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            graph_upload_request.edgeAction4Duplication = graphUpload_pb2.Action4Duplication.Value(
                'UPDATE')
            res = gs_call.upload_graph(graph_upload_request)
        except HTTPError as e:
            if e.code in RETRIABLE_STATUS_CODES:
                error = 'A retriable HTTP error %d occurred:\n%s' % (e.code,
                                                                     e.reason)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as e:
            error = 'A retriable error occurred: %s' % e
        try:
            if res.failedNodes:
                re_upload_error = "some nodes failed to upload %s" % res.failedNodes
        except Exception:
            # res is None or has no failedNodes field
            pass
        if re_upload_error is not None:
            print(re_upload_error)
            nodes_fail_retry += 1
            res = None
            if nodes_fail_retry > NODES_FAIL_MAX_RETRIES:
                ylog.debug(res)
                res = "continue"

        if error is not None:
            print(error)
            retry += 1
            res = None
            if retry > MAX_RETRIES:
                ylog.debug(res)
                # no longer attempting to retry
                break
            ylog.debug(res)
            # exponential backoff with jitter before the next attempt
            max_sleep = 2**retry
            sleep_seconds = random.random() * max_sleep
            print('Sleeping %f seconds and then retrying...' % sleep_seconds)
            time.sleep(sleep_seconds)
    # the loop exits once a response is received (or retries are exhausted)
    try:
        if res.nodeUpdateResultStatistics:
            ylog.debug(res.nodeUpdateResultStatistics)
            uploaded_number = res.nodeUpdateResultStatistics.numOfCreations + \
                res.nodeUpdateResultStatistics.numOfUpdates + \
                res.nodeUpdateResultStatistics.numOfSkips
        if res.uploadedNodes:
            for updated in res.uploadedNodes:
                ylog.debug("uploaded node GID: %s" % updated.gid)
        if res.failedNodes:
            for err in res.failedNodes:
                if err.error.errorCode != 202001:
                    ylog.info(err.error)
                    ylog.debug(err.error)
    except Exception:
        # res may be None or the string "continue"; skip the statistics
        pass

    return uploaded_number
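The retry loop above combines bounded retries with exponential backoff and random jitter; a minimal standalone sketch of that pattern (the helper name and limits are illustrative):

import random
import time

MAX_RETRIES = 5


def call_with_backoff(call):
    """Retry `call` on any exception, sleeping random.random() * 2**retry between attempts."""
    for retry in range(1, MAX_RETRIES + 1):
        try:
            return call()
        except Exception as e:
            if retry == MAX_RETRIES:
                raise
            sleep_seconds = random.random() * 2**retry
            print('Sleeping %f seconds and then retrying... (%s)' % (sleep_seconds, e))
            time.sleep(sleep_seconds)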
Example #6
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
"""
import logging
from ylib import ylog
from lib.gftTools import gftIO
from lib.gftTools.proto import graphUpload_pb2
from lib.gftTools.gftIO import GSError

ylog.set_level(logging.DEBUG)
ylog.console_on()
# ylog.filelog_on("wiki_upload")
ylog.info("start")


def skill_result_2_graph(resp_run_node_action):
    """
    convert skill_pb2.RespRunNodeAction to graphUpload.proto.Graph,
    so that in Python we only need one graph format.

    Keyword Arguments:
    resp_run_node_action -- skill_pb2.RespRunNodeAction, result from get_graph_from_neo4j

    Return:
    graphUpload_pb2.graph
    """
    graph_upload_request = graphUpload_pb2.GraphUploadRequest()

    for n in resp_run_node_action.graphs[0].graph.nodes:
        node = graph_upload_request.graph.nodes.add()
Example #7
def create_continuous_contract(start_date, end_date, contract_data, target):
    ''' parse contract data to get a continuous price series for each group.
    Parameters
    ----------
    start_date: datetime
    end_date: datetime
    contract_data: OOTTV
        columns: contract name, contract code, date, settlement date, close price
    target: list or None
        contracts to parse; None parses all contracts.

    Returns
    -------
    continuous_price: DataFrame
    '''

    if isinstance(contract_data, gftIO.GftTable):
        data = contract_data.asColumnTab().copy()

    if isinstance(target, list):
        target = gftIO.strSet2Np(np.array(target))

    name = {
        'INNERCODE': 'contract_code',
        'OPTIONCODE': 'contract_name',
        'SETTLEMENTDATE': 'settlement_date',
        'ENDDATE': 'date',
        'CLOSEPRICE': 'close_price'
    }
    data.rename(columns=lambda x: name[x], inplace=True)
    data.dropna(subset=['settlement_date'], inplace=True)
    continuous_price = pd.DataFrame()

    if target is None:
        target = data['contract_name'].unique()

    for num_contract, contract in enumerate(target):
        ylog.info(num_contract)
        ylog.info(contract)
        target_data = data[data['contract_name'] == contract]
        # one settlement (expiry) date per contract code, in chronological order
        target_expiry_dates = target_data[['contract_code', 'settlement_date']].\
            drop_duplicates().sort_values('settlement_date')
        target_expiry_dates.set_index('contract_code', inplace=True)
        target_expiry_dates = target_expiry_dates[target_expiry_dates.columns[
            0]]
        target_data = target_data.loc[:,
                                      ['date', 'contract_code', 'close_price']]
        # pivot to a date x contract_code price matrix
        # (this shadows the contract_data parameter; `data` already holds the raw table)
        contract_data = target_data.pivot(
            index='date', columns='contract_code', values='close_price')
        contract_dates = contract_data.index
        continuous_contract_price = pd.Series(
            np.ones(len(contract_dates)), index=contract_dates, name=contract)
        prev_date = contract_dates[0]
        # Loop through each contract and create the specific weightings for
        # each contract depending upon the rollover date and price adjustment method.
        # For backtesting we use last-trading-day rollover and backward ratio price adjustment.
        target_data_with_datetimeindex = target_data.set_index('date')
        price_adjust_ratio = pd.Series(
            np.ones(len(target_expiry_dates)),
            index=target_expiry_dates.values,
            name='ratio')
        adjusted_price = pd.Series(index=contract_dates, name=contract)
        target_data_with_datetimeindex['close_price'].replace(
            to_replace=0, method='bfill', inplace=True)
        target_data_with_datetimeindex['close_price'].replace(
            to_replace=0, method='pad', inplace=True)
        target_data_with_datetimeindex = target_data_with_datetimeindex[
            ~target_data_with_datetimeindex.index.duplicated()]
        for i, (item, ex_date) in enumerate(target_expiry_dates.items()):
            if i < len(target_expiry_dates) - 1 \
               and ex_date < target_data_with_datetimeindex.index[-1]:
                idx_ex_date = target_data_with_datetimeindex.index.searchsorted(
                    ex_date)
                pre_ex_date = contract_dates[idx_ex_date - 1]
                # ex_date has no price data, move ex_date to the next trading date.
                if ex_date not in target_data_with_datetimeindex.index and \
                   idx_ex_date + 1 < len(target_data_with_datetimeindex.index):
                    ex_date = contract_dates[idx_ex_date + 1]
                else:
                    continue
                price_adjust_ratio.loc[ex_date] = target_data_with_datetimeindex['close_price'].loc[ex_date] / \
                    target_data_with_datetimeindex['close_price'].loc[pre_ex_date]

        # create the adjusted price as the product of the target price and the
        # cumulative adjustment ratio.
        for i, (item, ex_date) in enumerate(target_expiry_dates.items()):
            idx_ex_date = contract_data.index.searchsorted(ex_date)
            pre_ex_date = contract_dates[idx_ex_date - 1]
            adjusted_price.loc[prev_date:pre_ex_date] = target_data_with_datetimeindex['close_price'].loc[prev_date:pre_ex_date] * \
                price_adjust_ratio.loc[ex_date:].cumprod().iloc[-1]
            prev_date = ex_date
        continuous_price = pd.concat([continuous_price, adjusted_price], axis=1)
    return continuous_price
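For intuition, a tiny self-contained illustration of the backward ratio adjustment idea at a single rollover, with made-up prices; it is not wired to the gftIO structures used above.

import pandas as pd

# made-up closes for two consecutive contracts around one rollover date
old_contract = pd.Series(
    [100.0, 101.0, 102.0],
    index=pd.to_datetime(['2017-01-03', '2017-01-04', '2017-01-05']))
new_contract = pd.Series(
    [105.0, 106.0],
    index=pd.to_datetime(['2017-01-06', '2017-01-09']))

# backward ratio adjustment: scale the expiring contract so its last close
# lines up with the new contract's first close, then splice the two series
ratio = new_contract.iloc[0] / old_contract.iloc[-1]
continuous = pd.concat([old_contract * ratio, new_contract])
print(continuous)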
Example #8
        ylog.debug('[x]%s' % title)
        # ylog.debug(item['title'])
        break

dl.download_from_doi('10.1145/2449396.2449413')

with open('/home/weiwu/share/deep_learning/data/My Collection.bib'
          ) as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)
items = []
for article in bib_database.entries:
    if article['ENTRYTYPE'] == 'article':
        if article.get('doi') is not None:
            title = article['title'][1:-1]
            # title = ' '.join(['+' + x for x in title.split()])
            ylog.info(title)
            result = {'target': title}
            w1 = works.query(title).sort('relevance').order('desc')
            i = 0
            for item in w1:
                i = i + 1
                try:
                    t = item.get('title')[0]
                except Exception:
                    # record has no title; skip it
                    continue
                if SequenceMatcher(a=title, b=t).ratio() > 0.9:
                    result['result'] = item['title']
                    target_doi = article.get('doi').lower()
                    found_doi = item['DOI'].lower()
                    ylog.debug("target doi: %s" % target_doi)
                    ylog.debug("found  doi: %s" % found_doi)