Code Example #1
File: deeplink_parsing.py Project: yhao21/trypython
    def parsing_coin_deeplink(self):
        '''
        Parsing more detail info for each crypto currency.
        '''

        self.init_file()

        total_workload = len(glob.glob(self.file_path + '/*.html'))
        workload = len(glob.glob(self.file_path + '/*.html'))

        for one_file in glob.glob(self.file_path + '/*.html'):

            start_time = time.time()
            # fname is kept so the url can still be paired if an error occurs
            # during parsing, e.g., ethereum
            self.fname = re.compile(r'link/(.*).html').findall(one_file)[0]

            with open(one_file, 'r') as f:
                html = f.read()

            soup = BeautifulSoup(html, 'html.parser')
            rows = soup.find('tbody', {
                'class': 'cmc-details-panel-about__table'
            }).find_all('tr')

            # coin_name below is also used as the file name
            try:
                # strip all spaces and lower-case the name, since the two
                # sites sometimes format the same coin's name differently.
                coin_name = re.compile(r'(.*) Price').findall(
                    rows[0].find('strong').string)[0].replace(' ', '').lower()
                mktcap_rank = rows[2].find('td').string.replace('#', '')
                cir_supply = self.remove_coin_name(
                    rows[5].find('td').string).replace(',', '')
                all_time_high = self.remove_coin_name(
                    rows[8].find('td').find('div').string).replace('$', '').replace(',', '')
                all_time_low = self.remove_coin_name(
                    rows[9].find('td').find('div').string).replace('$', '').replace(',', '')

                # [<div>$0.065951 USD /</div>, <div>$0.060618 USD</div>]
                days_7_high_low = rows[13].find('td').find_all('div')
                days_7_high = days_7_high_low[0].string.replace('$', '').replace('USD /', '')
                days_7_low = self.remove_coin_name(days_7_high_low[1].string).replace('$', '')
                table_header = soup.find('div', {
                    'class': 'v5fhlm-0 jdAFKL cmc-details-panel-tabs col-xs-12'
                }).find('ul', {'class': 'cmc-tabs__header'}).find_all('li')
                # /currencies/bytom/ratings/
                rating_base = table_header[5].find('a')['href']
                rating_url = 'https://coinmarketcap.com' + rating_base

                self.df = self.df.append(
                    {
                        'name': coin_name,
                        'rank': mktcap_rank,
                        'circulating_supply': cir_supply,
                        'all_time_high': all_time_high,
                        'all_time_low': all_time_low,
                        '7_days_high': days_7_high,
                        '7_days_low': days_7_low,
                        'rating_url': rating_url
                    },
                    ignore_index=True)

            except Exception:
                print('\nERROR:')
                print('Website does not contain full info about this currency')
                print('Info of this error is saved to' +
                      self.output_format(self.log_name))
                self.save_to_log()

            workload -= 1
            nth_file = total_workload - workload
            round_time = time.time() - start_time
            print('\nFinished parsing' + self.output_format(self.fname) + \
                    '...' * 20 + '(%d/%d)' % (nth_file, total_workload))
            wtl(round_time, 0, workload)
            print('-' * 100 + '\n\n')

        order = [
            'name', 'rank', 'circulating_supply', 'all_time_high',
            'all_time_low', '7_days_high', '7_days_low', 'rating_url'
        ]
        self.df = self.df[order]
        self.df.to_csv(self.coin_df_name)
        print(self.df)
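
A note on the helper used above: remove_coin_name is defined elsewhere in the project and is not shown on this page. A minimal sketch of what it might do, assuming the table cells end with a ticker or currency suffix (e.g. '19,000,000 BTC' or '0.065951 USD'); the regex below is a guess, not the project's actual implementation:

import re

def remove_coin_name(raw):
    # assumed behavior: strip a trailing ticker such as ' BTC' or ' USD',
    # keeping only the numeric part of the cell text
    return re.sub(r'\s*[A-Za-z]+\s*$', '', raw).strip()

print(remove_coin_name('19,000,000 BTC'))  # 19,000,000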
Code Example #2
File: deeplink_parsing.py Project: yhao21/trypython
    def parsing_gecko_deeplink(self):
        '''
        Parsing more detail info for each crypto currency.
        '''

        self.init_file()
        self.deeplink_dataframe = pd.read_csv('500deeplinks.csv')

        total_workload = len(glob.glob(self.file_path + '/*.html'))
        workload = len(glob.glob(self.file_path + '/*.html'))

        for one_file in glob.glob(self.file_path + '/*.html'):

            start_time = time.time()
            # fname is kept so the url can still be paired if an error occurs
            # during parsing, e.g., ethereum
            self.fname = re.compile(r'link/(.*).html').findall(one_file)[0]

            # the lookup returns rows like:
            #   https://www.coingecko.com/en/coins/arpa-chain   arpa-chain
            #   https://coinmarketcap.com/currencies/medibloc   medibloc
            # Use str.contains to keep only the gecko rows; coinmarketcap's
            # info is not needed here.
            self.url_base = self.deeplink_dataframe.loc[
                self.deeplink_dataframe['name'] == self.fname]
            self.url_base = self.url_base.loc[
                self.url_base['deeplinks'].str.contains('www.coingecko.com')]
            self.url_base = self.url_base.iloc[:, 1].values[0]

            with open(one_file, 'r') as f:
                html = f.read()

            name = re.compile(
                r'<th scope="row" class="border-top-0"><strong>(.*?) Price</strong></th>'
            ).findall(html)[0].lower().replace(' ', '')
            rank = re.compile(
                r'<th scope="row">Market Cap Rank</th>\n<td>\n#(.*?)\n</td>'
            ).findall(html)[0]
            # stringified match list: equals "['ROI']" when the page mentions
            # ROI exactly once
            ROI = str(re.compile(r'ROI').findall(html))

            soup = BeautifulSoup(html, 'html.parser')
            rows = soup.find(
                'div', {
                    'class':
                    'col-lg-4 card-column d-flex flex-column-reverse flex-sm-column order-3 order-sm-3 order-md-3 order-lg-2'
                }).find('table').find_all('tr')

            table_header = soup.find('div', {
                'class': 'tab-content'
            }).find('ul').find_all('li')
            rating_base = table_header[6].find('a')['href']
            rating_url = self.url_base + rating_base
            print(rating_url)

            try:
                # some pages include an ROI row in the panel, which shifts the
                # positions of all the other tds by one
                if ROI == "['ROI']":
                    days_7_high_low = rows[7].find('td').find_all('span')
                    days_7_low = days_7_high_low[0].string.replace('$', '').replace(',', '')
                    days_7_high = days_7_high_low[1].string.replace('$', '').replace(',', '')
                    all_time_high = rows[9].find('td').find('span').string.replace('$', '').replace(',', '')
                    all_time_low = rows[10].find('td').find('span').string.replace('$', '').replace(',', '')
                else:
                    days_7_high_low = rows[6].find('td').find_all('span')
                    days_7_low = days_7_high_low[0].string.replace('$', '').replace(',', '')
                    days_7_high = days_7_high_low[1].string.replace('$', '').replace(',', '')
                    all_time_high = rows[8].find('td').find('span').string.replace('$', '').replace(',', '')
                    all_time_low = rows[9].find('td').find('span').string.replace('$', '').replace(',', '')

                self.df = self.df.append(
                    {
                        'name': name,
                        'rank': rank,
                        'all_time_high': all_time_high,
                        'all_time_low': all_time_low,
                        '7_days_high': days_7_high,
                        '7_days_low': days_7_low,
                        'rating_url': rating_url,
                    },
                    ignore_index=True)

            except Exception:
                print('\nERROR:')
                print('Website does not contain full info about this currency')
                print('Info of this error is saved to' +
                      self.output_format(self.log_name))
                self.save_to_log()

            workload -= 1
            nth_file = total_workload - workload
            round_time = time.time() - start_time
            print('\nFinished parsing' + self.output_format(self.fname) + \
                    '...' * 20 + '(%d/%d)' % (nth_file, total_workload))
            wtl(round_time, 0, workload)
            print('-' * 100 + '\n\n')

        order = [
            'name', 'rank', 'all_time_high', 'all_time_low', '7_days_high',
            '7_days_low', 'rating_url'
        ]
        self.df = self.df[order]
        self.df.to_csv(self.gecko_df_name)
        print(self.df)
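
The url_base lookup near the top of this method is a two-step pandas filter over the deeplinks CSV. The same pattern in isolation, with made-up rows (the CSV layout is inferred from the comments above, so treat the column names as an assumption):

import pandas as pd

# assumed layout of 500deeplinks.csv: one row per (name, deeplink) pair
df = pd.DataFrame({
    'name': ['arpa-chain', 'arpa-chain', 'medibloc'],
    'deeplinks': [
        'https://www.coingecko.com/en/coins/arpa-chain',
        'https://coinmarketcap.com/currencies/arpa-chain',
        'https://coinmarketcap.com/currencies/medibloc',
    ],
})

rows = df.loc[df['name'] == 'arpa-chain']
gecko = rows.loc[rows['deeplinks'].str.contains('www.coingecko.com')]
url_base = gecko['deeplinks'].values[0]
print(url_base)  # https://www.coingecko.com/en/coins/arpa-chain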
Code Example #3
    def parsing_coin_deeplink(self):
        '''
        Parsing more detail info for each crypto currency.
        '''

        # remove error log when you restart this program.
        if os.path.exists(self.log_name + '.csv'):
            os.remove(self.log_name + '.csv')

        self.init_file()
        url_check_list = pd.read_csv('500deeplinks.csv')

        total_workload = len(glob.glob(self.file_path + '/*.html'))
        workload = len(glob.glob(self.file_path + '/*.html'))

        ### test
        #for one_file in glob.glob(self.file_path + '/polkadot-new.html'):
        for one_file in glob.glob(self.file_path + '/*.html'):

            start_time = time.time()
            # fname is kept so the url can still be paired if an error occurs
            # during parsing, e.g., ethereum
            self.fname = re.compile(r'link/(.*).html').findall(one_file)[0]

            with open(one_file, 'r') as f:
                html = f.read()

            soup = BeautifulSoup(html, 'html.parser')

            try:
                rows = soup.find('tbody', {'class':'cmc-details-panel-about__table'}).find_all('tr')
                mktcap_rank = rows[2].find('td').string.replace('#', '')
                cir_supply = self.remove_coin_name(rows[5].find('td').string).replace(',', '')
                all_time_high = self.remove_coin_name(rows[8].find('td').find('div').string).replace('$', '').replace(',', '')
                all_time_low = self.remove_coin_name(rows[9].find('td').find('div').string).replace('$', '').replace(',', '')

                # [<div>$0.065951 USD /</div>, <div>$0.060618 USD</div>]
                days_7_high_low = rows[13].find('td').find_all('div')
                days_7_high = days_7_high_low[0].string.replace('$', '').replace('USD /', '')
                days_7_low = self.remove_coin_name(days_7_high_low[1].string).replace('$', '')

                self.df = self.df.append({
                    'name':self.fname,
                    'rank':mktcap_rank,
                    'circulating_supply':cir_supply,
                    'all_time_high':all_time_high,
                    'all_time_low':all_time_low,
                    '7_days_high':days_7_high,
                    '7_days_low':days_7_low,
                    },ignore_index = True)

            except Exception:
                try:
                    # some coinmarketcap pages have four side panels, so the
                    # info is spread across several tables in a different order
                    tables = soup.find_all('tbody',{'class':'cmc-details-panel-about__table'})
                    rank = tables[0].find_all('tr')[4].find('td').string.replace('#', '')
                    cir_supply = re.compile(r'(.*) \w*').findall(tables[-1].find_all('tr')[0].find('td').string)[0].replace(',','')
                    all_time_high = re.compile(r'\$(.*) \w*').findall(tables[2].find_all('tr')[4].find('td').find('div').string)[0]
                    all_time_low = re.compile(r'\$(.*) \w*').findall(tables[2].find_all('tr')[5].find('td').find('div').string)[0]
                    days_7_high = re.compile(r'\$(.*) \w* \/').findall(tables[2].find_all('tr')[0].find('td').find('div').string)[0]
                    days_7_low = re.compile(r'\$(.*) \w*').findall(tables[2].find_all('tr')[0].find('td').find_all('div')[1].string)[0]

                    self.df = self.df.append({
                        'name':self.fname,
                        'rank':rank,
                        'circulating_supply':cir_supply,
                        'all_time_high':all_time_high,
                        'all_time_low':all_time_low,
                        '7_days_high':days_7_high,
                        '7_days_low':days_7_low,
                        },ignore_index = True)

                except Exception:

                    print('\nERROR:')
                    print('Website does not contain full info about this currency')
                    print('Info of this error is saved to' + self.output_format(self.log_name))
                    # append the name even when the other details can't be parsed;
                    # otherwise gecko urls can't be paired with coin urls
                    # (e.g., 20 coin_urls vs 19 gecko_urls)
                    self.df = self.df.append({
                        'name':self.fname,
                        }, ignore_index = True)
                    self.save_to_log('coinmkt')
                    print(one_file)

            workload -= 1
            nth_file = total_workload - workload
            round_time = time.time() - start_time
            print('\nFinished parsing' + self.output_format(self.fname) + \
                    '...' * 20 + '(%d/%d)' % (nth_file, total_workload))
            wtl(round_time, 0, workload)
            print('-' * 100 + '\n\n')


        order = ['name','rank','circulating_supply', 'all_time_high', 'all_time_low', '7_days_high', '7_days_low']
        self.df = self.df[order]
        #self.df.to_csv(self.coin_df_name)
        print(self.df)
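
The nested try/except above is a two-layout fallback: parse the common single-table page first, and only if that raises, retry with the four-panel layout before logging the failure. The shape of that pattern, distilled; parse_single_table and parse_side_panels are hypothetical stand-ins for the two parsing blocks, not the project's API:

from bs4 import BeautifulSoup

def parse_single_table(tbody):
    # hypothetical stand-in for the index-based parsing in the first try block
    rows = tbody.find_all('tr')
    return {'rank': rows[2].find('td').string.replace('#', '')}

def parse_side_panels(tables):
    # hypothetical stand-in for the regex-based parsing in the second try block
    rows = tables[0].find_all('tr')
    return {'rank': rows[4].find('td').string.replace('#', '')}

def parse_with_fallback(html):
    soup = BeautifulSoup(html, 'html.parser')
    try:
        return parse_single_table(
            soup.find('tbody', {'class': 'cmc-details-panel-about__table'}))
    except Exception:
        try:
            return parse_side_panels(
                soup.find_all('tbody', {'class': 'cmc-details-panel-about__table'}))
        except Exception:
            return None  # caller logs the file and keeps only the coin name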
Code Example #4
    def parsing_gecko_deeplink(self):
        '''
        Parsing more detail info for each crypto currency.
        '''

        self.init_file()
        self.deeplink_dataframe = pd.read_csv('500deeplinks.csv')


        total_workload = len(glob.glob(self.file_path + '/*.html'))
        workload = len(glob.glob(self.file_path + '/*.html'))

        for one_file in glob.glob(self.file_path + '/*.html'):

            start_time = time.time()
            # fname is kept so the url can still be paired if an error occurs
            # during parsing, e.g., ethereum
            self.fname = re.compile(r'link/(.*).html').findall(one_file)[0]

            # the lookup returns rows like:
            #   https://www.coingecko.com/en/coins/arpa-chain   arpa-chain
            #   https://coinmarketcap.com/currencies/medibloc   medibloc
            # Use str.contains to keep only the gecko rows; coinmarketcap's
            # info is not needed here.
            self.url_base = self.deeplink_dataframe.loc[self.deeplink_dataframe['name'] == self.fname]
            self.url_base = self.url_base.loc[self.url_base['deeplinks'].str.contains('www.coingecko.com')]
            # columns: [unnamed index, name, deeplinks]
            self.url_base = self.url_base.iloc[:, 2].values[0]

            with open(one_file, 'r') as f:
                html = f.read()

            # stringified match list: equals "['ROI']" when the page mentions
            # ROI exactly once
            ROI = str(re.compile(r'ROI').findall(html))

            soup = BeautifulSoup(html, 'html.parser')

            try:
                rows = soup.find('div',{'class':'col-lg-4 card-column d-flex flex-column-reverse flex-sm-column order-3 order-sm-3 order-md-3 order-lg-2'}).find('table').find_all('tr')
                # some pages include an ROI row in the panel, which shifts the
                # positions of all the other tds by one
                if ROI == "['ROI']":
                    rank = re.compile(r'#(\d*)').findall(rows[8].find('td').string)[0]
                    days_7_high_low = rows[7].find('td').find_all('span')
                    days_7_low = days_7_high_low[0].string.replace('$','').replace(',','')
                    days_7_high = days_7_high_low[1].string.replace('$','').replace(',','')
                    all_time_high =rows[9].find('td').find('span').string.replace('$','').replace(',','')
                    all_time_low = rows[10].find('td').find('span').string.replace('$','').replace(',','')
                else:
                    rank = re.compile(r'#(\d*)').findall(rows[7].find('td').string)[0]
                    days_7_high_low = rows[6].find('td').find_all('span')
                    days_7_low = days_7_high_low[0].string.replace('$','').replace(',','')
                    days_7_high = days_7_high_low[1].string.replace('$','').replace(',','')
                    all_time_high = rows[8].find('td').find('span').string.replace('$','').replace(',','')
                    all_time_low = rows[9].find('td').find('span').string.replace('$','').replace(',','')

                self.df = self.df.append({
                    'name':self.fname,
                    'rank':rank,
                    'all_time_high':all_time_high,
                    'all_time_low':all_time_low,
                    '7_days_high':days_7_high,
                    '7_days_low':days_7_low,
                    },ignore_index = True)

            except Exception:
                print('\nERROR:')
                print('Website does not contain full info about this currency')
                print('Info of this error is saved to' + self.output_format(self.log_name))

                # append the name even when the other details can't be parsed;
                # otherwise gecko urls can't be paired with coin urls
                # (e.g., 20 coin_urls vs 19 gecko_urls)
                self.df = self.df.append({
                    'name':self.fname,
                    }, ignore_index = True)
                self.save_to_log('gecko')
                print(one_file)

            workload -= 1
            nth_file = total_workload - workload
            round_time = time.time() - start_time
            print('\nFinished parsing' + self.output_format(self.fname) + \
                    '...' * 20 + '(%d/%d)' % (nth_file, total_workload))
            wtl(round_time, 0, workload)
            print('-' * 100 + '\n\n')


        order = ['name','rank', 'all_time_high', 'all_time_low', '7_days_high', '7_days_low']
        self.df = self.df[order]
        self.df.to_csv(self.gecko_df_name)
        print(self.df)
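
The two branches of the ROI check differ only by a constant row offset, so the same logic can be written once. A sketch under that assumption (offset of 1 when the panel has an ROI row, matching the indices used above; note the original tests str(findall) == "['ROI']", i.e. exactly one occurrence):

import re

def parse_stats(rows, has_roi):
    # has_roi shifts every row index below by one
    off = 1 if has_roi else 0
    high_low = rows[6 + off].find('td').find_all('span')
    return {
        'rank': re.compile(r'#(\d*)').findall(rows[7 + off].find('td').string)[0],
        '7_days_low': high_low[0].string.replace('$', '').replace(',', ''),
        '7_days_high': high_low[1].string.replace('$', '').replace(',', ''),
        'all_time_high': rows[8 + off].find('td').find('span').string.replace('$', '').replace(',', ''),
        'all_time_low': rows[9 + off].find('td').find('span').string.replace('$', '').replace(',', ''),
    }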
Code Example #5
    def parsing_coin_html(self):
        '''
        Parsing coinmktcap html files
        '''

        self.init_file()

        total_workload = len(glob.glob(self.file_path + '/*.html'))
        workload = len(glob.glob(self.file_path + '/*.html'))
        
        for one_file in glob.glob(self.file_path + '/*.html'):

            start_time = time.time()
            # first round: 1_15, last round: 192_15
            rep_round = re.compile(r'(\d*_15)_page').findall(one_file)[0]

            with open(one_file, 'r') as f:
                html = f.read()
            
            soup = BeautifulSoup(html, 'html.parser')
            rows = soup.find('tbody').find_all(
                'tr', {'class': 'rc-table-row rc-table-row-level-0 cmc-table-row'})
            
            for row in rows:
                tds = row.find_all('td')
                rank = tds[1].find('p').string
                name = tds[2].find('div').find('p').string.replace(' ','').lower()
                abbrev = tds[2].find('div').find('p', \
                        {'class':'Text-sc-1eb5slv-0 eweNDy coin-item-symbol'}).string
                price = tds[3].find('div').find('a').string.replace('$','').replace(',','')
                volume = tds[7].find('a').find('p').string.replace('$', '').replace(',','')
                mktcap = tds[6].find('p').string.replace('$', '').replace(',','')

                # identify which round this is: the first 15 minutes is round
                # '1', the last is round '192'
                repetition = re.compile(r'(\d*)_15').findall(rep_round)[0]


                # note: findall returns repetition as a str, so compare against '1'
                if repetition == '1':
                    url_base = 'https://coinmarketcap.com'
                    link_base = tds[2].find('a')['href']
                    self.deeplink = url_base + link_base
                    self.url_name = self.get_url_name(link_base, '0')
                    self.deep_scrapping_url.append(self.deeplink)

                #self.deeplink = ''

                self.df = self.df.append({
                    'repetition':repetition,
                    'rank':rank,
                    'name':name,
                    'url_name':self.url_name,
                    'price':price,
                    'abbr':abbrev,
                    '24hr_volume': volume,
                    'mktcap':mktcap,
                    'deeplink':self.deeplink
                    },ignore_index = True)



            round_time = time.time() - start_time
            print('parsing' + self.output_format('CoinMKT_file') + '-' * 100 + '(%s/%s)' % (total_workload - workload + 1, total_workload))
            wtl(round_time, 0, workload - 1)
            print('\n\n')
            workload -= 1


        order = ['repetition','rank', 'name', 'abbr', 'url_name', 'price', '24hr_volume', 'mktcap', 'deeplink']
        self.df = self.df[order]
        print(self.df)
        self.df.to_csv('CoinMKT_48hrs_data.csv')
        print('\nNB: Prepared' + self.output_format(len(self.deep_scrapping_url)) + 'deeplink URLs for you.\n\n')


        return self.deep_scrapping_url
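
The wtl helper called in the loop is not shown on this page. Judging by its arguments (duration of the last round, an offset, and the number of files still to process), it appears to print an estimated time left; a minimal sketch of such a helper, purely as an assumption about its behavior:

def wtl(round_time, elapsed, workload):
    # assumed behavior: estimate time left from the last round's duration
    # and the number of files remaining
    remaining = round_time * workload + elapsed
    mins, secs = divmod(int(remaining), 60)
    print('Estimated time left: %d min %d sec' % (mins, secs))

wtl(1.8, 0, 50)  # Estimated time left: 1 min 30 sec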
Code Example #6
    def parsing_gecko_html(self):
        '''
        Parsing coingecko html files
        '''

        self.init_file()

        total_workload = len(glob.glob(self.file_path + '/*.html'))
        workload = len(glob.glob(self.file_path + '/*.html'))

        for one_file in glob.glob(self.file_path + '/*.html'):

            start_time = time.time()
            # first round: 1_15, last round: 192_15
            rep_round = re.compile(r'(\d*_15)_page').findall(one_file)[0]

            with open(one_file, 'r') as f:
                html = f.read()

            rows = BeautifulSoup(html, 'html.parser').find('tbody').find_all('tr')
            for row in rows:
                tds = row.find_all('td')
                rank = re.compile(r'\d*').findall(tds[1].string)[1]
                # e.g. ['', 'Bitcoin', 'Diamond', '']
                name_frac = re.compile(r'(\w*)').findall(tds[2].find('a',\
                        {'class':'d-none d-lg-flex font-bold align-items-center justify-content-between'}).string.replace(' ','').lower())
                name = name_frac[1]
                for name_char in name_frac[2:]:
                    if name_char != '':
                        name += name_char

                abbr_frac = re.compile(r'\w*').findall(tds[2].find('a', \
                        {'class':'d-lg-none font-bold'}).string.replace(' ',''))
                abbr = abbr_frac[1]
                for abbr_char in abbr_frac[2:]:
                    if abbr_char != '':
                        abbr += abbr_char
                
                price = tds[3].find('span').string.replace('$','').replace(',','')

                try:
                    volume = tds[7].find('span').string.replace('$','').replace(',','')
                except Exception:
                    # default to 0 when no volume information is present
                    volume = 0
                mktcap = tds[8].find('span').string.replace('$','').replace(',','')
                repetition = re.compile(r'(\d*)_15').findall(rep_round)[0]

                if repetition == '1':
                    url_base = 'https://www.coingecko.com'
                    link_base = tds[2].find('a',{'class':'d-lg-none font-bold'})['href']
                    self.deeplink = url_base + link_base
                    # url_name is needed to match the deeplink info with this
                    # 48hr data; note that later programs use url_name as "name"
                    self.url_name = self.get_url_name(link_base, '1')
                    self.deep_scrapping_url.append(self.deeplink)

                #self.deeplink = ''

                self.df = self.df.append({
                    'repetition':repetition,
                    'rank':rank,
                    'name':name,
                    'url_name':self.url_name,
                    'price':price,
                    'abbr':abbr,
                    '24hr_volume':volume,
                    'mktcap':mktcap,
                    'deeplink':self.deeplink
                    },ignore_index = True)


            round_time = time.time() - start_time
            print('parsing' + self.output_format('Gecko_file') + '-' * 100 + '(%s/%s)' % (total_workload - workload + 1, total_workload))
            wtl(round_time, 0, workload - 1)
            print('\n\n')
            workload -= 1

            
        order = ['repetition','rank', 'name', 'abbr', 'url_name', 'price', '24hr_volume', 'mktcap', 'deeplink']
        self.df = self.df[order]
        print(self.df)
        self.df.to_csv('Gecko_48hrs_data.csv')
        print('\nNB: Prepared' + self.output_format(len(self.deep_scrapping_url)) + 'deeplink URLs for you.\n\n')


        return self.deep_scrapping_url
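
The fragment-joining loops for name and abbr above can be collapsed to a single ''.join, since the empty strings in the findall result contribute nothing. A standalone check of the equivalence:

import re

raw = '\nBitcoin\nDiamond\n'  # .string often keeps the tag's newlines
frac = re.compile(r'(\w*)').findall(raw.replace(' ', '').lower())
# frac -> ['', 'bitcoin', '', 'diamond', '', '']

name = frac[1]            # the loop used in the example above
for ch in frac[2:]:
    if ch != '':
        name += ch

assert name == ''.join(frac)  # empty fragments add nothing
print(name)  # bitcoindiamond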