import glob
import os
import re
import time

import pandas as pd
from bs4 import BeautifulSoup

# wtl is a progress/ETA printer imported from elsewhere in this project;
# a compatible sketch appears further down in this section.


def parsing_coin_deeplink(self):
    '''Parse more detailed info for each cryptocurrency.'''
    self.init_file()
    total_workload = len(glob.glob(self.file_path + '/*.html'))
    workload = len(glob.glob(self.file_path + '/*.html'))
    for one_file in glob.glob(self.file_path + '/*.html'):
        start_time = time.time()
        # fname is saved for pairing the url if an error occurs during
        # parsing, e.g. 'ethereum'.
        self.fname = re.compile(r'link/(.*).html').findall(one_file)[0]
        with open(one_file, 'r') as f:
            html = f.read()
        soup = BeautifulSoup(html, 'html.parser')
        rows = soup.find('tbody', {
            'class': 'cmc-details-panel-about__table'
        }).find_all('tr')
        try:
            # Delete all spaces and convert to lower case, because the names
            # sometimes differ between the two sites. coin_name is also the
            # file name.
            coin_name = re.compile(r'(.*) Price').findall(
                rows[0].find('strong').string)[0].replace(' ', '').lower()
            mktcap_rank = rows[2].find('td').string.replace('#', '')
            cir_supply = self.remove_coin_name(
                rows[5].find('td').string).replace(',', '')
            all_time_high = self.remove_coin_name(
                rows[8].find('td').find('div').string).replace('$', '').replace(',', '')
            all_time_low = self.remove_coin_name(
                rows[9].find('td').find('div').string).replace('$', '').replace(',', '')
            # [<div>$0.065951 USD /</div>, <div>$0.060618 USD</div>]
            days_7_high_low = rows[13].find('td').find_all('div')
            days_7_high = days_7_high_low[0].string.replace('$', '').replace('USD /', '')
            days_7_low = self.remove_coin_name(days_7_high_low[1].string).replace('$', '')
            table_header = soup.find(
                'div', {'class': 'v5fhlm-0 jdAFKL cmc-details-panel-tabs col-xs-12'}
            ).find('ul', {'class': 'cmc-tabs__header'}).find_all('li')
            # e.g. /currencies/bytom/ratings/
            rating_base = table_header[5].find('a')['href']
            # rating_base already starts with '/', so no extra slash here.
            rating_url = 'https://coinmarketcap.com' + rating_base
            self.df = self.df.append({
                'name': coin_name,
                'rank': mktcap_rank,
                'circulating_supply': cir_supply,
                'all_time_high': all_time_high,
                'all_time_low': all_time_low,
                '7_days_high': days_7_high,
                '7_days_low': days_7_low,
                'rating_url': rating_url
            }, ignore_index=True)
        except Exception:
            print('\nERROR:')
            print('Website does not contain full info about this currency')
            print('Info of this error is saved to' +
                  self.output_format(self.log_name))
            self.save_to_log()
        workload -= 1
        nth_file = total_workload - workload
        round_time = time.time() - start_time
        print('\nFinish parsing' + self.output_format(self.fname) +
              '...' * 20 + '(%d/%d)' % (nth_file, total_workload))
        wtl(round_time, 0, workload)
        print('-' * 100 + '\n\n')
    order = ['name', 'rank', 'circulating_supply', 'all_time_high',
             'all_time_low', '7_days_high', '7_days_low', 'rating_url']
    self.df = self.df[order]
    self.df.to_csv(self.coin_df_name)
    print(self.df)
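# `remove_coin_name` is called above but not defined in this section. Below
# is a minimal sketch of what it plausibly does, assuming the scraped cells
# look like '18,512,381 BTC' or '$0.065951 USD' and only the leading numeric
# part is wanted; the body is an assumption, not the original helper.
def remove_coin_name(self, cell_text):
    # Keep the leading run of '$', digits, commas and dots; drop the symbol.
    match = re.match(r'[$\d.,]+', cell_text.strip())
    return match.group(0) if match else cell_text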
def parsing_gecko_deeplink(self):
    '''Parse more detailed info for each cryptocurrency.'''
    self.init_file()
    self.deeplink_dataframe = pd.read_csv('500deeplinks.csv')
    total_workload = len(glob.glob(self.file_path + '/*.html'))
    workload = len(glob.glob(self.file_path + '/*.html'))
    for one_file in glob.glob(self.file_path + '/*.html'):
        start_time = time.time()
        # fname is saved for pairing the url if an error occurs during
        # parsing, e.g. 'ethereum'.
        self.fname = re.compile(r'link/(.*).html').findall(one_file)[0]
        # The lookup returns rows like:
        #   https://www.coingecko.com/en/coins/arpa-chain   arpa-chain
        #   https://coinmarketcap.com/currencies/medibloc   medibloc
        # Use str.contains to keep only the gecko rows; we don't need
        # coinmarketcap's info here.
        self.url_base = self.deeplink_dataframe.loc[
            self.deeplink_dataframe['name'] == self.fname]
        self.url_base = self.url_base.loc[
            self.url_base['deeplinks'].str.contains('www.coingecko.com')]
        self.url_base = self.url_base.iloc[:, 1].values[0]
        with open(one_file, 'r') as f:
            html = f.read()
        name = re.compile(
            r'<th scope="row" class="border-top-0"><strong>(.*?) Price</strong></th>'
        ).findall(html)[0].lower().replace(' ', '')
        rank = re.compile(
            r'<th scope="row">Market Cap Rank</th>\n<td>\n#(.*?)\n</td>'
        ).findall(html)[0]
        roi_hits = re.compile(r'ROI').findall(html)
        soup = BeautifulSoup(html, 'html.parser')
        rows = soup.find('div', {
            'class': 'col-lg-4 card-column d-flex flex-column-reverse flex-sm-column order-3 order-sm-3 order-md-3 order-lg-2'
        }).find('table').find_all('tr')
        table_header = soup.find('div', {'class': 'tab-content'}).find('ul').find_all('li')
        rating_base = table_header[6].find('a')['href']
        rating_url = self.url_base + rating_base
        print(rating_url)
        try:
            # Some pages have an extra ROI row in the panel, which shifts the
            # position of every following <td> by one.
            if roi_hits == ['ROI']:
                days_7_high_low = rows[7].find('td').find_all('span')
                days_7_low = days_7_high_low[0].string.replace('$', '').replace(',', '')
                days_7_high = days_7_high_low[1].string.replace('$', '').replace(',', '')
                all_time_high = rows[9].find('td').find('span').string.replace('$', '').replace(',', '')
                all_time_low = rows[10].find('td').find('span').string.replace('$', '').replace(',', '')
            else:
                days_7_high_low = rows[6].find('td').find_all('span')
                days_7_low = days_7_high_low[0].string.replace('$', '').replace(',', '')
                days_7_high = days_7_high_low[1].string.replace('$', '').replace(',', '')
                all_time_high = rows[8].find('td').find('span').string.replace('$', '').replace(',', '')
                all_time_low = rows[9].find('td').find('span').string.replace('$', '').replace(',', '')
            self.df = self.df.append({
                'name': name,
                'rank': rank,
                'all_time_high': all_time_high,
                'all_time_low': all_time_low,
                '7_days_high': days_7_high,
                '7_days_low': days_7_low,
                'rating_url': rating_url,
            }, ignore_index=True)
        except Exception:
            print('\nERROR:')
            print('Website does not contain full info about this currency')
            print('Info of this error is saved to' +
                  self.output_format(self.log_name))
            self.save_to_log()
        workload -= 1
        nth_file = total_workload - workload
        round_time = time.time() - start_time
        print('\nFinish parsing' + self.output_format(self.fname) +
              '...' * 20 + '(%d/%d)' % (nth_file, total_workload))
        wtl(round_time, 0, workload)
        print('-' * 100 + '\n\n')
    order = ['name', 'rank', 'all_time_high', 'all_time_low',
             '7_days_high', '7_days_low', 'rating_url']
    self.df = self.df[order]
    self.df.to_csv(self.gecko_df_name)
    print(self.df)
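# The `.iloc[:, 1]` lookup above depends on the column order of
# 500deeplinks.csv, while the second version of this method further down
# indexes `.iloc[:, 2]` against an [index, name, deeplinks] layout instead.
# A small sanity check (a sketch, not part of the original code) for
# confirming which layout your CSV actually has; 'ethereum' is just an
# example name:
def _check_deeplink_csv_layout():
    deeplinks = pd.read_csv('500deeplinks.csv')
    print(deeplinks.columns.tolist())  # e.g. ['Unnamed: 0', 'deeplinks', 'name']
    row = deeplinks.loc[deeplinks['name'] == 'ethereum']
    row = row.loc[row['deeplinks'].str.contains('www.coingecko.com')]
    print(row.iloc[:, 1].values[0])    # should print the coingecko deeplink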
def parsing_coin_deeplink(self):
    '''Parse more detailed info for each cryptocurrency. This variant adds a
    fallback parser for pages whose panel layout differs, and logs failures.
    '''
    # Remove the error log when the program restarts.
    if os.path.exists(self.log_name + '.csv'):
        os.remove(self.log_name + '.csv')
    self.init_file()
    url_check_list = pd.read_csv('500deeplinks.csv')  # loaded for reference; not used below
    total_workload = len(glob.glob(self.file_path + '/*.html'))
    workload = len(glob.glob(self.file_path + '/*.html'))
    # For testing a single page:
    # for one_file in glob.glob(self.file_path + '/polkadot-new.html'):
    for one_file in glob.glob(self.file_path + '/*.html'):
        start_time = time.time()
        # fname is saved for pairing the url if an error occurs during
        # parsing, e.g. 'ethereum'.
        self.fname = re.compile(r'link/(.*).html').findall(one_file)[0]
        with open(one_file, 'r') as f:
            html = f.read()
        soup = BeautifulSoup(html, 'html.parser')
        try:
            rows = soup.find('tbody', {
                'class': 'cmc-details-panel-about__table'
            }).find_all('tr')
            mktcap_rank = rows[2].find('td').string.replace('#', '')
            cir_supply = self.remove_coin_name(rows[5].find('td').string).replace(',', '')
            all_time_high = self.remove_coin_name(
                rows[8].find('td').find('div').string).replace('$', '').replace(',', '')
            all_time_low = self.remove_coin_name(
                rows[9].find('td').find('div').string).replace('$', '').replace(',', '')
            # [<div>$0.065951 USD /</div>, <div>$0.060618 USD</div>]
            days_7_high_low = rows[13].find('td').find_all('div')
            days_7_high = days_7_high_low[0].string.replace('$', '').replace('USD /', '')
            days_7_low = self.remove_coin_name(days_7_high_low[1].string).replace('$', '')
            self.df = self.df.append({
                'name': self.fname,
                'rank': mktcap_rank,
                'circulating_supply': cir_supply,
                'all_time_high': all_time_high,
                'all_time_low': all_time_low,
                '7_days_high': days_7_high,
                '7_days_low': days_7_low,
            }, ignore_index=True)
        except Exception:
            try:
                # Some coinmarketcap pages have four side panels, with the
                # info spread across different tables.
                tables = soup.find_all('tbody', {'class': 'cmc-details-panel-about__table'})
                rank = tables[0].find_all('tr')[4].find('td').string.replace('#', '')
                cir_supply = re.compile(r'(.*) \w*').findall(
                    tables[-1].find_all('tr')[0].find('td').string)[0].replace(',', '')
                all_time_high = re.compile(r'\$(.*) \w*').findall(
                    tables[2].find_all('tr')[4].find('td').find('div').string)[0]
                all_time_low = re.compile(r'\$(.*) \w*').findall(
                    tables[2].find_all('tr')[5].find('td').find('div').string)[0]
                days_7_high = re.compile(r'\$(.*) \w* \/').findall(
                    tables[2].find_all('tr')[0].find('td').find('div').string)[0]
                days_7_low = re.compile(r'\$(.*) \w*').findall(
                    tables[2].find_all('tr')[0].find('td').find_all('div')[1].string)[0]
                self.df = self.df.append({
                    'name': self.fname,
                    # 'rank' here, not 'mktcap_rank': mktcap_rank is undefined
                    # in this fallback branch and would raise a NameError.
                    'rank': rank,
                    'circulating_supply': cir_supply,
                    'all_time_high': all_time_high,
                    'all_time_low': all_time_low,
                    '7_days_high': days_7_high,
                    '7_days_low': days_7_low,
                }, ignore_index=True)
            except Exception:
                print('\nERROR:')
                print('Website does not contain full info about this currency')
                print('Info of this error is saved to' +
                      self.output_format(self.log_name))
                # Add the name (and deeplink url) to df even when the detail
                # info can't be parsed; otherwise gecko's urls can't be paired
                # with coin's urls, e.g. 20 coin_url vs 19 gecko_url.
                self.df = self.df.append({
                    'name': self.fname,
                }, ignore_index=True)
                self.save_to_log('coinmkt')
                print(one_file)
        workload -= 1
        nth_file = total_workload - workload
        round_time = time.time() - start_time
        print('\nFinish parsing' + self.output_format(self.fname) +
              '...' * 20 + '(%d/%d)' % (nth_file, total_workload))
        wtl(round_time, 0, workload)
        print('-' * 100 + '\n\n')
    order = ['name', 'rank', 'circulating_supply', 'all_time_high',
             'all_time_low', '7_days_high', '7_days_low']
    self.df = self.df[order]
    # self.df.to_csv(self.coin_df_name)
    print(self.df)
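# `save_to_log` is called with no argument in the first two methods and with
# a source tag ('coinmkt' / 'gecko') in these later ones. A plausible sketch
# that fits both call sites, assuming it appends the failing coin
# (self.fname) to the CSV error log named by self.log_name; the body is an
# assumption, not the original helper.
import csv

def save_to_log(self, source=''):
    log_file = self.log_name + '.csv'
    is_new = not os.path.exists(log_file)
    with open(log_file, 'a', newline='') as f:
        writer = csv.writer(f)
        if is_new:
            writer.writerow(['name', 'source'])  # header only on first write
        writer.writerow([self.fname, source])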
def parsing_gecko_deeplink(self):
    '''Parse more detailed info for each cryptocurrency. This variant keys
    rows by self.fname and logs failures per site.
    '''
    self.init_file()
    self.deeplink_dataframe = pd.read_csv('500deeplinks.csv')
    total_workload = len(glob.glob(self.file_path + '/*.html'))
    workload = len(glob.glob(self.file_path + '/*.html'))
    for one_file in glob.glob(self.file_path + '/*.html'):
        start_time = time.time()
        # fname is saved for pairing the url if an error occurs during
        # parsing, e.g. 'ethereum'.
        self.fname = re.compile(r'link/(.*).html').findall(one_file)[0]
        # The lookup returns rows like:
        #   https://www.coingecko.com/en/coins/arpa-chain   arpa-chain
        #   https://coinmarketcap.com/currencies/medibloc   medibloc
        # Use str.contains to keep only the gecko rows; we don't need
        # coinmarketcap's info here.
        self.url_base = self.deeplink_dataframe.loc[
            self.deeplink_dataframe['name'] == self.fname]
        self.url_base = self.url_base.loc[
            self.url_base['deeplinks'].str.contains('www.coingecko.com')]
        # Columns: [unnamed index, name, deeplinks]
        self.url_base = self.url_base.iloc[:, 2].values[0]
        with open(one_file, 'r') as f:
            html = f.read()
        roi_hits = re.compile(r'ROI').findall(html)
        soup = BeautifulSoup(html, 'html.parser')
        try:
            rows = soup.find('div', {
                'class': 'col-lg-4 card-column d-flex flex-column-reverse flex-sm-column order-3 order-sm-3 order-md-3 order-lg-2'
            }).find('table').find_all('tr')
            # Some pages have an extra ROI row in the panel, which shifts the
            # position of every following <td> by one.
            if roi_hits == ['ROI']:
                rank = re.compile(r'#(\d*)').findall(rows[8].find('td').string)[0]
                days_7_high_low = rows[7].find('td').find_all('span')
                days_7_low = days_7_high_low[0].string.replace('$', '').replace(',', '')
                days_7_high = days_7_high_low[1].string.replace('$', '').replace(',', '')
                all_time_high = rows[9].find('td').find('span').string.replace('$', '').replace(',', '')
                all_time_low = rows[10].find('td').find('span').string.replace('$', '').replace(',', '')
            else:
                rank = re.compile(r'#(\d*)').findall(rows[7].find('td').string)[0]
                days_7_high_low = rows[6].find('td').find_all('span')
                days_7_low = days_7_high_low[0].string.replace('$', '').replace(',', '')
                days_7_high = days_7_high_low[1].string.replace('$', '').replace(',', '')
                all_time_high = rows[8].find('td').find('span').string.replace('$', '').replace(',', '')
                all_time_low = rows[9].find('td').find('span').string.replace('$', '').replace(',', '')
            self.df = self.df.append({
                'name': self.fname,
                'rank': rank,
                'all_time_high': all_time_high,
                'all_time_low': all_time_low,
                '7_days_high': days_7_high,
                '7_days_low': days_7_low,
            }, ignore_index=True)
        except Exception:
            print('\nERROR:')
            print('Website does not contain full info about this currency')
            print('Info of this error is saved to' +
                  self.output_format(self.log_name))
            # Add the name (and deeplink url) to df even when the detail info
            # can't be parsed; otherwise gecko's urls can't be paired with
            # coin's urls, e.g. 20 coin_url vs 19 gecko_url.
            self.df = self.df.append({
                'name': self.fname,
            }, ignore_index=True)
            self.save_to_log('gecko')
            print(one_file)
        workload -= 1
        nth_file = total_workload - workload
        round_time = time.time() - start_time
        print('\nFinish parsing' + self.output_format(self.fname) +
              '...' * 20 + '(%d/%d)' % (nth_file, total_workload))
        wtl(round_time, 0, workload)
        print('-' * 100 + '\n\n')
    order = ['name', 'rank', 'all_time_high', 'all_time_low',
             '7_days_high', '7_days_low']
    self.df = self.df[order]
    self.df.to_csv(self.gecko_df_name)
    print(self.df)
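# `wtl` is imported from elsewhere in this project and prints a time-left
# estimate after each round. A compatible sketch matching the call sites
# wtl(round_time, 0, workload); the name, second argument and exact output
# format are assumptions.
def wtl(round_time, elapsed, workload):
    # Project the remaining time from the duration of the last round.
    remaining = round_time * workload + elapsed
    mins, secs = divmod(int(remaining), 60)
    print('Roughly %dmin %02dsec left (%.1fs per file, %d files to go)'
          % (mins, secs, round_time, workload))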
def parsing_coin_html(self):
    '''Parse the coinmarketcap html files.'''
    self.init_file()
    total_workload = len(glob.glob(self.file_path + '/*.html'))
    workload = len(glob.glob(self.file_path + '/*.html'))
    for one_file in glob.glob(self.file_path + '/*.html'):
        start_time = time.time()
        # First round: 1_15, last round: 192_15.
        rep_round = re.compile(r'(\d*_15)_page').findall(one_file)[0]
        with open(one_file, 'r') as f:
            html = f.read()
        soup = BeautifulSoup(html, 'html.parser')
        rows = soup.find('tbody').find_all(
            'tr', {'class': 'rc-table-row rc-table-row-level-0 cmc-table-row'})
        for row in rows:
            tds = row.find_all('td')
            rank = tds[1].find('p').string
            name = tds[2].find('div').find('p').string.replace(' ', '').lower()
            abbrev = tds[2].find('div').find(
                'p', {'class': 'Text-sc-1eb5slv-0 eweNDy coin-item-symbol'}).string
            price = tds[3].find('div').find('a').string.replace('$', '').replace(',', '')
            volume = tds[7].find('a').find('p').string.replace('$', '').replace(',', '')
            mktcap = tds[6].find('p').string.replace('$', '').replace(',', '')
            # Identify which round this is: the first 15 minutes is '1',
            # the last 15 minutes is '192'. The regex returns repetition as a
            # str, so compare against the string '1'.
            repetition = re.compile(r'(\d*)_15').findall(rep_round)[0]
            if repetition == '1':
                url_base = 'https://coinmarketcap.com'
                link_base = tds[2].find('a')['href']
                self.deeplink = url_base + link_base
                self.url_name = self.get_url_name(link_base, '0')
                self.deep_scrapping_url.append(self.deeplink)
                # self.deeplink = ''
            self.df = self.df.append({
                'repetition': repetition,
                'rank': rank,
                'name': name,
                'url_name': self.url_name,
                'price': price,
                'abbr': abbrev,
                '24hr_volume': volume,
                'mktcap': mktcap,
                'deeplink': self.deeplink
            }, ignore_index=True)
        round_time = time.time() - start_time
        print('parsing' + self.output_format('CoinMKT_file') + '-' * 100 +
              '(%s/%s)' % (total_workload - workload + 1, total_workload))
        wtl(round_time, 0, workload - 1)
        print('\n\n')
        workload -= 1
    order = ['repetition', 'rank', 'name', 'abbr', 'url_name', 'price',
             '24hr_volume', 'mktcap', 'deeplink']
    self.df = self.df[order]
    print(self.df)
    self.df.to_csv('CoinMKT_48hrs_data.csv')
    print('\nNB: Prepare' + self.output_format(len(self.deep_scrapping_url)) +
          'deeplink URLs for you.\n\n')
    return self.deep_scrapping_url
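# `get_url_name` converts a relative deeplink into the slug that later
# programs use to pair rows by name. A minimal sketch inferred from the two
# call sites, where '0' marks a coinmarketcap path (/currencies/<slug>/) and
# '1' a coingecko path (/en/coins/<slug>); the body is an assumption.
def get_url_name(self, link_base, site_flag):
    # Both layouts keep the slug in the last non-empty path segment, so
    # site_flag is accepted only to match the call sites.
    return link_base.strip('/').split('/')[-1]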
def parsing_gecko_html(self):
    '''Parse the coingecko html files.'''
    self.init_file()
    total_workload = len(glob.glob(self.file_path + '/*.html'))
    workload = len(glob.glob(self.file_path + '/*.html'))
    for one_file in glob.glob(self.file_path + '/*.html'):
        start_time = time.time()
        # First round: 1_15, last round: 192_15.
        rep_round = re.compile(r'(\d*_15)_page').findall(one_file)[0]
        with open(one_file, 'r') as f:
            html = f.read()
        rows = BeautifulSoup(html, 'html.parser').find('tbody').find_all('tr')
        for row in rows:
            tds = row.find_all('td')
            rank = re.compile(r'\d*').findall(tds[1].string)[1]
            # findall(r'(\w*)') splits the name into fragments such as
            # ['', 'Bitcoin', '', 'Diamond', '']; joining from index 1 drops
            # the empty fragments and concatenates the rest.
            name_frac = re.compile(r'(\w*)').findall(
                tds[2].find('a', {'class': 'd-none d-lg-flex font-bold align-items-center justify-content-between'}).string.replace(' ', '').lower())
            name = ''.join(name_frac[1:])
            abbr_frac = re.compile(r'\w*').findall(
                tds[2].find('a', {'class': 'd-lg-none font-bold'}).string.replace(' ', ''))
            abbr = ''.join(abbr_frac[1:])
            price = tds[3].find('span').string.replace('$', '').replace(',', '')
            try:
                volume = tds[7].find('span').string.replace('$', '').replace(',', '')
            except AttributeError:
                # No volume information on the page; default to 0.
                volume = 0
            mktcap = tds[8].find('span').string.replace('$', '').replace(',', '')
            repetition = re.compile(r'(\d*)_15').findall(rep_round)[0]
            if repetition == '1':
                url_base = 'https://www.coingecko.com'
                link_base = tds[2].find('a', {'class': 'd-lg-none font-bold'})['href']
                self.deeplink = url_base + link_base
                # url_name is needed to match the deeplink info with this
                # 48hrs data; note that later programs use url_name as "name".
                self.url_name = self.get_url_name(link_base, '1')
                self.deep_scrapping_url.append(self.deeplink)
                # self.deeplink = ''
            self.df = self.df.append({
                'repetition': repetition,
                'rank': rank,
                'name': name,
                'url_name': self.url_name,
                'price': price,
                'abbr': abbr,
                '24hr_volume': volume,
                'mktcap': mktcap,
                'deeplink': self.deeplink
            }, ignore_index=True)
        round_time = time.time() - start_time
        print('parsing' + self.output_format('Gecko_file') + '-' * 100 +
              '(%s/%s)' % (total_workload - workload + 1, total_workload))
        wtl(round_time, 0, workload - 1)
        print('\n\n')
        workload -= 1
    order = ['repetition', 'rank', 'name', 'abbr', 'url_name', 'price',
             '24hr_volume', 'mktcap', 'deeplink']
    self.df = self.df[order]
    print(self.df)
    self.df.to_csv('Gecko_48hrs_data.csv')
    print('\nNB: Prepare' + self.output_format(len(self.deep_scrapping_url)) +
          'deeplink URLs for you.\n\n')
    return self.deep_scrapping_url
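# A hedged usage sketch for the two 48-hour parsers above. The class name
# `Scraper` and its construction are hypothetical; the method names and
# their return values (the deeplink URL lists) come from this section.
if __name__ == '__main__':
    scraper = Scraper()  # hypothetical class that owns the methods above
    coin_urls = scraper.parsing_coin_html()
    gecko_urls = scraper.parsing_gecko_html()
    print('%d coinmarketcap and %d coingecko deeplinks collected'
          % (len(coin_urls), len(gecko_urls)))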