# Start every run from an empty working directory. ignore_errors lets the very
# first run (directory not there yet) proceed instead of crashing.
shutil.rmtree('/home/sahil/fundamentals6', ignore_errors=True)
# Absolute path, so the directory created is the one chdir'd into below no
# matter what the current working directory is.
os.mkdir('/home/sahil/fundamentals6')

options = Options()
options.headless = True
driver = webdriver.Firefox(
    options=options,
    executable_path='/home/sahil/Downloads/geckodriver')
driver.get(
    'https://www.moneycontrol.com/markets/earnings/latest-results/latest/yoy/standalone/'
)
os.chdir('/home/sahil/fundamentals6')


def ext_symb(pg):
    """Extract the BSE and NSE identifiers from a parsed page.

    Reads the text of the first ``ctag`` element with class ``mob-hide`` and
    looks at what follows the markers ``BSE`` and ``NSE`` in it.

    Args:
        pg: Parsed page object exposing ``find(name, class_=...)``.

    Returns:
        ``(bse, nse)``: ``bse`` is the list of digit runs found after the
        ``BSE`` marker, ``nse`` the list of word runs found after the ``NSE``
        marker. A list is empty when its marker is not present.

    Raises:
        AttributeError: if the page has no matching ``ctag`` element.
    """
    # Keep the text as str: encoding to bytes and then searching with str
    # arguments fails on Python 3.
    sym = pg.find('ctag', class_='mob-hide').text

    def _matches_after(marker, pattern):
        pos = sym.find(marker)
        if pos == -1:
            # Without this guard the slice would start at index 2 (-1 + 3)
            # and return matches unrelated to the marker.
            return []
        return re.findall(pattern, sym[pos + len(marker):])

    bse = _matches_after('BSE', r'\d+')
    nse = _matches_after('NSE', r'\w+')
    return bse, nse


urls = []


def map1(lst):
    """Append the ``href`` attribute of tag *lst* to the module-level ``urls``."""
    urls.append(lst['href'])


pg = bsoap(driver.page_source, 'html.parser')
# Collected eagerly: on Python 3 a bare map(map1, ...) is lazy and would leave
# ``urls`` empty.
urls.extend(tag['href'] for tag in pg.find_all('a', class_='op_gld13'))

k = []
# At most the first 2732 links are visited (the original fixed cap); slicing
# avoids relying on a swallowed IndexError when fewer links were found.
for i, url in enumerate(urls[:2732]):
    try:
        driver.get(url)
    except Exception as e:  # keep going, but do not hide Ctrl-C or the cause
        print(e)
        continue
    try:
        driver.find_element_by_class_name('Ratios').click()
    except Exception:
        print('exception caught')
        continue
    pg1 = bsoap(driver.page_source, 'html.parser')
    title = pg1.find('h1', class_='pcstname').text
    bse, nse = ext_symb(pg1)
def scrapedata(self):
    """Scrape the historical-price tables for ``self.name`` from moneycontrol.

    Reads three attributes of ``self``:

    * ``self.name`` - text typed into the company search box.
    * ``self.ch`` - ``'d'`` fills the daily form (exchange ``NSE``,
      01 Mar 2008 to 01 Mar 2019); any other value fills the monthly form
      (Mar 2000 to Mar 2019).
    * ``self.ch1`` - ``0`` sets the ``pageLoadStrategy`` capability to
      ``"eager"``, anything else to ``"normal"``.

    Side effects: changes the working directory to
    ``/home/sahil/projdir/fundamentals8`` and starts a headless Firefox that
    is always shut down before returning, including when a step raises.

    Returns:
        A list with one entry per result page; each entry is the list of
        DataFrames ``pd.read_html`` produced for that page's ``tblchart``
        table.
    """

    def click_with_retry(click, attempts=3, delay=5):
        """Call *click* until it succeeds or *attempts* tries have failed."""
        for _ in range(attempts):
            try:
                click()
                return True
            except Exception as e:
                print(e)
                print('sleeping')
                time.sleep(delay)
        return False

    os.chdir('/home/sahil/projdir/fundamentals8')
    # NOTE(review): ``caps`` is never passed to webdriver.Firefox. It is the
    # shared DesiredCapabilities.FIREFOX dict, so the assignment below mutates
    # library-wide state; presumably that is how the setting reaches the
    # driver - confirm before changing.
    caps = DesiredCapabilities().FIREFOX
    caps["pageLoadStrategy"] = "eager" if self.ch1 == 0 else "normal"
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(
        options=options,
        executable_path='/home/sahil/Downloads/geckodriver')
    try:
        driver.get('https://www.moneycontrol.com/india/stockpricequote/')
        print(driver.current_url)
        driver.find_element_by_xpath('//*[@id="company"]').send_keys(self.name)

        # Both clicks are best-effort: after three failures the scrape carries
        # on with whatever page is loaded, as before.
        click_with_retry(lambda: driver.find_element_by_css_selector(
            "div.MT2:nth-child(1) > input:nth-child(2)").click())
        print(driver.current_url)
        click_with_retry(lambda: driver.find_element_by_link_text(
            "Historical Prices").click())

        pg = bsoap(driver.page_source, 'html.parser')
        driver.get(pg.find('a', title='Click Here')['href'])

        # Looked up for both modes so a missing '#ex' element still raises.
        exchange = Select(driver.find_element_by_css_selector('#ex'))
        if self.ch == 'd':
            exchange.select_by_visible_text('NSE')
            selections = (
                ('frm_dy', '01'),
                ('frm_mth', 'Mar'),
                ('frm_yr', '2008'),
                ('to_dy', '01'),
                ('to_mth', 'Mar'),
                ('to_yr', '2019'),
            )
            submit_css = ('td.PL20:nth-child(1) > form:nth-child(1) > '
                          'div:nth-child(4) > input:nth-child(4)')
        else:
            selections = (
                ('mth_frm_mth', 'Mar'),
                ('mth_frm_yr', '2000'),
                ('mth_to_mth', 'Mar'),
                ('mth_to_yr', '2019'),
            )
            submit_css = ('td.PT15:nth-child(3) > form:nth-child(1) > '
                          'div:nth-child(4) > input:nth-child(3)')
        for field_name, visible_text in selections:
            Select(driver.find_element_by_name(field_name)
                   ).select_by_visible_text(visible_text)
        driver.find_element_by_css_selector(submit_css).click()

        k = []
        while True:
            pg = bsoap(driver.page_source, 'html.parser')
            tab = pg.find('table', class_='tblchart')
            k.append(pd.read_html(str(tab)))
            elem = pg.find_all('a', class_='nextprev')
            if not elem:
                break
            # Work on str throughout: the old ``str(url.encode('ascii'))``
            # yields "b'...'" on Python 3. split() also leaves a URL without
            # '?' intact, where find() == -1 used to drop its last character.
            base_url = driver.current_url.split('?', 1)[0]
            driver.get(base_url + elem[0]['href'])
            print('next')
        print(k)
        return k
    finally:
        # Always release the browser, even when a step above raised.
        driver.quit()
def fin_data(i, driver):
    """Collect the 'Ratios' tables for company *i* and store the merged frame.

    Types *i* into the ``company`` search box of the page *driver* is on,
    submits it, calls ``driver_act(driver)`` (up to three tries), opens the
    link titled ``Ratios`` and then reads one table per page with
    ``pd.read_html`` while clicking the pager link, until the URL stops
    changing. Each table gets ``title``, ``NSE`` and ``BSE`` columns.

    Args:
        i: Search text for the company.
        driver: Selenium webdriver already positioned on the search page.

    Returns:
        * the column-wise concatenation of all page tables (a DataFrame) when
          paging finished normally; a de-duplicated-column copy of it is also
          passed to ``ext().store_file('/usr/share/app', 'finalkfr1', ...)``;
        * the list of tables read so far when a page step raised;
        * ``None`` when any other step raised (the error is printed).
    """
    try:
        print('in here')
        print(driver.current_url)
        driver.find_element_by_xpath('//*[@id="company"]').send_keys(i)
        driver.find_element_by_css_selector(
            'div.MT2:nth-child(1) > input:nth-child(2)').click()
        print(driver.current_url)

        # Best effort: give up silently after three failed tries.
        for _ in range(3):
            try:
                driver_act(driver)
                break
            except Exception as e:
                print(e)
                time.sleep(10)

        pg1 = bsoap(driver.page_source, 'html.parser')
        title = pg1.find('h1', class_='pcstname').text
        rat = pg1.find('a', title='Ratios')
        driver.get(rat['href'])
        bse, nse = ext_symb(pg1)
        bse_code = bse[0] if bse else ''
        nse_code = nse[0] if nse else ''

        k = []
        paging_ok = True
        while True:
            try:
                print('on')
                url1 = driver.current_url
                print(url1)
                k1 = pd.read_html(url1, header=0)[0]
                k.append(k1)
                print(len(k1))
                k1['title'] = title
                k1['NSE'] = nse_code
                k1['BSE'] = bse_code
                if nse:
                    print(nse[0])
                btx = driver.find_element_by_xpath(
                    '//*[@id="mc_content"]/div[2]/div/div[2]/ul/li[2]/a')
                driver.execute_script("arguments[0].click();", btx)
                # Plain str comparison; encoding to ASCII first could raise
                # on a URL containing non-ASCII characters.
                if driver.current_url == url1:
                    print('reach')
                    break
            except Exception as e:
                print(e)
                paging_ok = False
                break

        if paging_ok:
            k = pd.concat(k, axis=1)
            df = k.loc[:, ~k.columns.duplicated()]
            ext().store_file('/usr/share/app', 'finalkfr1', df)
    except Exception as e:
        print(e)
        return
    print(i)
    # Guarded: a company without an NSE symbol used to raise IndexError here
    # after all the work had already succeeded.
    if nse:
        print(nse[0])
    return k
def dailydata(s, driver, date1, date2):
    """Scrape the historical-price tables for company *s* between two dates.

    Uses *driver* (already on the search page) to look up *s*, open
    "Historical Prices", fill in the date form and walk through every result
    page. Relies on two module-level names: ``month`` (indexed by month
    number minus one to get the text shown in the month drop-down) and ``ch``
    (``'d'`` selects the daily NSE form using *date1*/*date2*; any other
    value selects the monthly form, which is fixed to Mar 2000 - Mar 2019 and
    ignores the dates).

    Args:
        s: Search text for the company; also written to the ``id`` column.
        driver: Selenium webdriver to drive. It is quit at the end of a
            completed scrape, but not on the early "link not found" return.
        date1: Start date as ``YYYY-MM-DD``.
        date2: End date as ``YYYY-MM-DD``.

    Returns:
        A DataFrame with the rows of all result pages plus an ``id`` column;
        it is also appended (pickled) to the file ``daily_data``. An empty
        DataFrame is returned when the "Historical Prices" link could not be
        clicked or no table was read.

    Raises:
        ValueError: if *date1* or *date2* is not a valid ``YYYY-MM-DD`` date.
    """

    def ext_date(datez, month):
        """Return (day, month text, year) strings for an ISO date string."""
        dt = datetime.strptime(datez, '%Y-%m-%d').date()
        # Built from the parsed date rather than by slicing the text, so
        # inputs such as '2019-3-1' still give '01' / the right month.
        return '%02d' % dt.day, month[dt.month - 1], '%04d' % dt.year

    def click_with_retry(click, attempts=3, delay=5):
        """Call *click* until it succeeds or *attempts* tries have failed."""
        for _ in range(attempts):
            try:
                click()
                return True
            except Exception as e:
                print(e)
                print('sleeping')
                time.sleep(delay)
        return False

    day_from, month_from, year_from = ext_date(date1, month)
    day_to, month_to, year_to = ext_date(date2, month)

    print(driver.current_url)
    driver.find_element_by_xpath('//*[@id="company"]').send_keys(s)
    # Best effort: continue even if the search button never became clickable.
    click_with_retry(lambda: driver.find_element_by_css_selector(
        "div.MT2:nth-child(1) > input:nth-child(2)").click())
    print(driver.current_url)
    if not click_with_retry(lambda: driver.find_element_by_link_text(
            "Historical Prices").click()):
        return pd.DataFrame()

    pg = bsoap(driver.page_source, 'html.parser')
    driver.get(pg.find('a', title='Click Here')['href'])

    # Looked up for both modes so a missing '#ex' element still raises.
    exchange = Select(driver.find_element_by_css_selector('#ex'))
    if ch == 'd':
        exchange.select_by_visible_text('NSE')
        selections = (
            ('frm_dy', str(day_from)),
            ('frm_mth', str(month_from)),
            ('frm_yr', str(year_from)),
            ('to_dy', str(day_to)),
            ('to_mth', str(month_to)),
            ('to_yr', str(year_to)),
        )
        submit_css = (
            '#mc_mainWrapper > div.PA10 > div > div.PT15 > div.PT10 > div.brdb > table > tbody > tr > td:nth-child(1) > form > div:nth-child(4) > input[type="image"]:nth-child(4)'
        )
    else:
        selections = (
            ('mth_frm_mth', 'Mar'),
            ('mth_frm_yr', '2000'),
            ('mth_to_mth', 'Mar'),
            ('mth_to_yr', '2019'),
        )
        submit_css = (
            'td.PT15:nth-child(3) > form:nth-child(1) > div:nth-child(4) > input:nth-child(3)'
        )
    for field_name, visible_text in selections:
        Select(driver.find_element_by_name(field_name)
               ).select_by_visible_text(visible_text)
    driver.find_element_by_css_selector(submit_css).click()

    k = []
    while True:
        try:
            pg = bsoap(driver.page_source, 'html.parser')
            tab = pg.find_all('table', class_='tblchart')[0]
            k.append(pd.read_html(str(tab)))
            elem = pg.find_all('a', class_='nextprev')
            if not elem:
                break
            # split() leaves a URL without '?' intact; find() == -1 would
            # have dropped its last character.
            url = driver.current_url.split('?', 1)[0] + elem[0]['href']
            driver.get(url)
            print(k)
            print('next')
        except Exception as e:
            print(e)
            # Stop paging: retrying the same page would fail the same way
            # and spin forever.
            break
    print(k)

    # Every page contributes a list of frames; concatenate all of them, not
    # just the first page's list.
    frames = [frame for page in k for frame in page]
    if frames:
        k = pd.concat(frames)
        k['id'] = s
        with open('daily_data', 'a+b') as d:
            pck.dump(k, d)
    else:
        k = pd.DataFrame()
    driver.quit()
    return k
def dailydata(s):
    """Scrape the historical-price tables for company *s*.

    Drives the module-level ``driver``: opens the stock-quote search page,
    looks up *s*, opens "Historical Prices", fills in the date form and walks
    through every result page. The module-level ``ch`` picks the form:
    ``'d'`` selects the daily NSE form for 01 Mar 2000 - 01 Mar 2020, any
    other value the monthly form for Mar 2000 - Mar 2019.

    Args:
        s: Search text for the company; also written to the ``id`` column.

    Returns:
        A DataFrame with the rows of all result pages plus an ``id`` column,
        or an empty DataFrame when no table was read. The raw per-page list
        of frames is appended (pickled) to the file ``daily_data``.
    """

    def click_with_retry(click, attempts=3, delay=5):
        """Call *click* until it succeeds or *attempts* tries have failed."""
        for _ in range(attempts):
            try:
                click()
                return True
            except Exception as e:
                print(e)
                print('sleeping')
                time.sleep(delay)
        return False

    driver.get('https://www.moneycontrol.com/india/stockpricequote/')
    print(driver.current_url)
    driver.find_element_by_xpath('//*[@id="company"]').send_keys(s)
    # Both clicks are best-effort: after three failures the scrape carries on
    # with whatever page is loaded, as before.
    click_with_retry(lambda: driver.find_element_by_css_selector(
        "div.MT2:nth-child(1) > input:nth-child(2)").click())
    print(driver.current_url)
    click_with_retry(lambda: driver.find_element_by_link_text(
        "Historical Prices").click())

    pg = bsoap(driver.page_source, 'html.parser')
    driver.get(pg.find('a', title='Click Here')['href'])

    # Looked up for both modes so a missing '#ex' element still raises.
    exchange = Select(driver.find_element_by_css_selector('#ex'))
    if ch == 'd':
        exchange.select_by_visible_text('NSE')
        selections = (
            ('frm_dy', '01'),
            ('frm_mth', 'Mar'),
            ('frm_yr', '2000'),
            ('to_dy', '01'),
            ('to_mth', 'Mar'),
            ('to_yr', '2020'),
        )
        submit_css = (
            '#mc_mainWrapper > div.PA10 > div > div.PT15 > div.PT10 > div.brdb > table > tbody > tr > td:nth-child(1) > form > div:nth-child(4) > input[type="image"]:nth-child(4)'
        )
    else:
        selections = (
            ('mth_frm_mth', 'Mar'),
            ('mth_frm_yr', '2000'),
            ('mth_to_mth', 'Mar'),
            ('mth_to_yr', '2019'),
        )
        submit_css = (
            'td.PT15:nth-child(3) > form:nth-child(1) > div:nth-child(4) > input:nth-child(3)'
        )
    for field_name, visible_text in selections:
        Select(driver.find_element_by_name(field_name)
               ).select_by_visible_text(visible_text)
    driver.find_element_by_css_selector(submit_css).click()

    k = []
    while True:
        try:
            pg = bsoap(driver.page_source, 'html.parser')
            tab = pg.find_all('table', class_='tblchart')[0]
            k.append(pd.read_html(str(tab)))
            elem = pg.find_all('a', class_='nextprev')
        except Exception as e:
            print(e)
            # Stop paging: ``elem`` would otherwise be unbound (first page)
            # or stale (later pages) in the check below.
            break
        if not elem:
            break
        # split() leaves a URL without '?' intact; find() == -1 would have
        # dropped its last character.
        url = driver.current_url.split('?', 1)[0] + elem[0]['href']
        driver.get(url)
        print(k)
        print('next')

    with open('daily_data', 'a+b') as d:
        pck.dump(k, d)
    print(k)

    # Every page contributes a list of frames; concatenate all of them, not
    # just the first page's list.
    frames = [frame for page in k for frame in page]
    if not frames:
        return pd.DataFrame()
    k = pd.concat(frames)
    k['id'] = s
    return k