def pub_data():
    """Collect PubMed records for every category query listed in pubmedcat.txt."""
    with open('pubmedcat.txt', 'r') as f:
        queries = f.read().split('\n')

    # Fetch the raw PubMed records for each query and keep them as JSON strings.
    s = []
    for query in queries:
        results = search_query(query)
        id_list = results['IdList']
        papers = fetch_details(id_list)
        for article in papers['PubmedArticle']:
            s.append(json.dumps(article, indent=2, separators=(',', ':')))

    author, title, date, types, source = [], [], [], [], []
    site, url, ref, pdf_url, abstract = [], [], [], [], []

    for le in range(len(s)):
        record = json.loads(s[le])
        data = record['MedlineCitation']['Article']
        try:
            url.append('https://pubmed.ncbi.nlm.nih.gov/' + record['MedlineCitation']['PMID'])
        except Exception:
            url.append(None)
        try:
            abstract.append(data['Abstract']['AbstractText'][0])
        except Exception:
            abstract.append(None)
        try:
            pdf_url.append('http://doi.org/' + data['ELocationID'][0])
        except Exception:
            pdf_url.append(None)
        site.append('pubmed')
        types.append('academic')
        try:
            issn = 'ISSN: ' + data['Journal']['ISSN']
            tit = data['Journal']['Title']
            vol = 'volume ' + data['Journal']['JournalIssue']['Volume']
            yr = data['Journal']['JournalIssue']['PubDate']['Year']
            ref.append(tit + '(' + issn + '),' + vol + '(' + yr + ')')
        except Exception:
            ref.append(None)
        try:
            source.append(data['Journal']['Title'])
        except Exception:
            source.append(None)
        try:
            completed = record['MedlineCitation']['DateCompleted']
            date.append(completed['Year'] + '-' + completed['Month'] + '-' + completed['Day'])
        except Exception:
            date.append(None)
        try:
            title.append(data['ArticleTitle'])
        except Exception:
            title.append(None)
        try:
            # "First Last, First Last, ..." for the whole author list.
            author.append(', '.join(a['ForeName'] + ' ' + a['LastName'] for a in data['AuthorList']))
        except Exception:
            author.append(None)

    df = pd.DataFrame({'Authors': author, 'Title': title, 'Date': date, 'Types': types,
                       'Source': source, 'Site': site, 'Url': url, 'Ref': ref,
                       'Pdf_url': pdf_url, 'Abstract': abstract})
    df = df.where(pd.notnull(df), np.nan)

    # Send each record unless search() says it is already indexed
    # (25 or more matches is treated as a duplicate).
    for i in df.index:
        try:
            t = pd.DataFrame()
            t = t.append(df.loc[i])
            t.reset_index(drop=True, inplace=True)
            try:
                count = search(t.loc[0]['Title'], t.loc[0]['Site'])
                print(count)
                if count < 25:
                    test = t.loc[0].to_json()
                    send_data(test, t.loc[0]['Site'])
                    print('Data sent')
                else:
                    print('Skipped')
            except Exception:
                test = t.loc[0].to_json()
                send_data(test, t.loc[0]['Site'])
        except Exception as e:
            print(e)
    print('info fetched')
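# search_query() and fetch_details() are imported from elsewhere in the project.
# A minimal sketch of what they might look like, assuming Biopython's Entrez
# client is used to query PubMed (the contact e-mail, retmax and sort values
# below are placeholders, not the project's actual settings):

from Bio import Entrez

def search_query(query):
    # Hypothetical: look up PubMed IDs matching one category query.
    Entrez.email = 'your.email@example.com'   # placeholder contact address
    handle = Entrez.esearch(db='pubmed', sort='relevance', retmax='20',
                            retmode='xml', term=query)
    return Entrez.read(handle)

def fetch_details(id_list):
    # Hypothetical: pull the full records for a list of PubMed IDs.
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed', retmode='xml', id=','.join(id_list))
    return Entrez.read(handle)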
def grab_data():
    """Scrape the TechCrunch listing page and push new articles to the backend."""
    # `url` is the listing-page URL defined at module level.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links_unread = soup.find_all(
        'div', {'class': 'post-block post-block--image post-block--unread'})
    links_read = soup.find_all(
        'div', {'class': 'post-block post-block--image post-block--read'})
    var = list(links_unread) + list(links_read)

    headline, timestamp, AUTHORS, SUMMARY = [], [], [], []
    date_crawled, news_source, full = [], [], []

    for block in var:
        title = block.find('a', {'class': 'post-block__title__link'}).text
        headline.append(title.replace('\n', '').replace('\t', ''))

        time = block.find('time', {'class': 'river-byline__time'}).text
        timestamp.append(time.replace('\n', '').replace('\t', ''))

        # Comma-separate every contributing author.
        author_links = block.find('span', {'class': 'river-byline__authors'}).find_all('a')
        author = ', '.join(a.text for a in author_links)
        AUTHORS.append(author.replace('\n', '').replace('\t', ''))

        summary = block.find('div', {'class': 'post-block__content'}).text
        SUMMARY.append(summary.replace('\n', '').replace('\t', ''))

        date_crawled.append(str(datetime.datetime.today().date()))
        news_source.append('https://techcrunch.com/')

        # Follow the article link and grab the full body text.
        full_article_url = block.find('a', {'class': 'post-block__title__link'})['href']
        article_page = requests.get(full_article_url)
        article_soup = BeautifulSoup(article_page.text, 'html.parser')
        full.append(article_soup.find('div', {'class': 'article-content'}).text)

    final = pd.DataFrame({
        'Title': headline,
        'Author': AUTHORS,
        'Summary': SUMMARY,
        'full_text': full,
        'date_published': timestamp,
        'date_crawled': date_crawled,
        'news_source': news_source
    })

    # Send each row unless search() says it is already indexed.
    for i in final.index:
        try:
            t = pd.DataFrame()
            t = t.append(final.loc[i])
            t.reset_index(drop=True, inplace=True)
            try:
                count = search(t.loc[0]['Title'])
                print(count)
                if count < 25:
                    test = t.loc[0].to_json()
                    send_data(test)
                    print('Data sent')
                else:
                    print('Skipped')
            except Exception:
                test = t.loc[0].to_json()
                send_data(test)
        except Exception as e:
            print(e)

    # start() is defined elsewhere in the project.
    start()
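# search() and send_data() are imported from elsewhere in the project; grab_data()
# only relies on "how many copies already exist?" and "store this JSON row".
# A minimal sketch of that contract, assuming a simple HTTP backend (the endpoint
# URL and response fields are placeholders, not the project's actual API; other
# collectors in this repo pass an extra site/news_source argument as well):

import requests

def search(title):
    # Hypothetical: return the number of stored documents already matching this title.
    resp = requests.get('http://localhost:8000/articles/count',
                        params={'title': title}, timeout=10)
    return resp.json().get('count', 0)

def send_data(payload):
    # Hypothetical: POST one article row (the JSON string produced by to_json()).
    requests.post('http://localhost:8000/articles', data=payload,
                  headers={'Content-Type': 'application/json'}, timeout=10)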
def grab_data():
    """Crawl every site in `websites`, keep links that look like articles, and push new ones."""
    asli = []
    for site_url in websites:
        response = requests.get(site_url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every outgoing link and de-duplicate while preserving order.
        url = [a['href'] for a in soup.find_all('a') if a.has_attr('href')]
        url = list(dict.fromkeys(url))

        # Drop links crawled in earlier runs and record the new ones in urlparsed.txt.
        try:
            with open('urlparsed.txt', 'r') as f:
                already_parsed = f.read().split('\n')
        except Exception:
            already_parsed = []
        url = [u for u in url if u not in already_parsed]
        already_parsed.extend(url)
        with open('urlparsed.txt', 'w') as f:
            for u in already_parsed:
                f.write(str(u) + '\n')

        # Keep only links that look like real articles: a long enough body
        # plus a usable description or summary.
        for link in url:
            try:
                if 'http' not in link:
                    link = site_url + link      # make relative links absolute
                requests.get(link, timeout=10)  # skip links that do not respond
                details = newspaper(link)
                if len(details.article) > 1500 and (
                        len(details.description) > 10 or len(details.summary) > 10):
                    asli.append(link)
            except Exception:
                pass

    headline, timestamp, AUTHORS, SUMMARY = [], [], [], []
    date_crawled, news_source, full, img_url = [], [], [], []
    keywords, url_news, types = [], [], []

    for link in asli:
        try:
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            driver = webdriver.Chrome('/usr/bin/chromedriver', chrome_options=chrome_options)
            driver.get(link)
            details = newspaper(link)

            if 'bbc' in link:
                news_source.append('bbc')
            elif 'techcrunch' in link:
                news_source.append('techcrunch')
            elif 'theguardian' in link:
                news_source.append('theguardian')
            elif 'voanews' in link:
                news_source.append('voanews')
            elif 'abc.net' in link:
                news_source.append('abc')
            else:
                news_source.append(None)  # keep the columns aligned for unknown hosts

            headline.append(details.headline)
            timestamp.append(details.date_publish)
            url_news.append(link)
            types.append('newspaper')
            AUTHORS.append(', '.join(details.authors))
            keywords.append(', '.join(details.keywords))
            if len(details.summary) > 10:
                SUMMARY.append(details.summary)
            else:
                SUMMARY.append(details.description)
            date_crawled.append(details.date_download)
            full.append(details.article)

            # Grab the first .jpg on the rendered page as a preview image.
            try:
                im = ''
                for img in driver.find_elements_by_tag_name('img'):
                    src = img.get_attribute('src') or ''
                    if '.jpg' in src:
                        im = src
                        break
                img_url.append(im if len(im) > 3 else None)
            except Exception:
                img_url.append(None)

            driver.close()
        except Exception:
            try:
                driver.close()
            except Exception:
                pass

    final = pd.DataFrame({'Title': headline, 'Author': AUTHORS, 'Summary': SUMMARY,
                          'full_text': full, 'date_published': timestamp,
                          'date_crawled': date_crawled, 'news_source': news_source,
                          'img': img_url, 'keywords': keywords,
                          'url_news': url_news, 'Types': types})

    # Send each row unless search() says it is already indexed.
    for i in final.index:
        try:
            t = pd.DataFrame()
            t = t.append(final.loc[i])
            t.reset_index(drop=True, inplace=True)
            try:
                count = search(t.loc[0]['Title'], t.loc[0]['news_source'])
                if count is None or count < 25:
                    test = t.loc[0].to_json()
                    send_data(test, t.loc[0]['news_source'])
                else:
                    pass  # already indexed, skip
            except Exception:
                test = t.loc[0].to_json()
                send_data(test, t.loc[0]['news_source'])
        except Exception:
            pass
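# newspaper() above is a project-level helper, not the newspaper3k class itself.
# A minimal sketch of the interface grab_data() expects, built here on top of
# newspaper3k's Article (the attribute mapping is an assumption; `category` has
# no direct newspaper3k equivalent and is left as None):

import datetime
from newspaper import Article

class newspaper:
    def __init__(self, url):
        art = Article(url)
        art.download()
        art.parse()
        art.nlp()                                 # fills summary/keywords (needs NLTK data)
        self.headline = art.title
        self.article = art.text
        self.authors = art.authors
        self.keywords = art.keywords
        self.summary = art.summary
        self.description = art.meta_description
        self.date_publish = art.publish_date
        self.date_download = datetime.datetime.now()
        self.category = None                      # not provided by newspaper3k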
def grab(test, count):
    """Scrape each Guardian article URL in `test` and push new rows to the backend."""
    # `count` is re-assigned from search() below, so the argument itself is unused.
    headline, timestamp, AUTHORS, SUMMARY = [], [], [], []
    date_crawled, news_source, full = [], [], []

    for k in range(len(test)):
        try:
            driver = webdriver.Chrome('/home/priyanshu/project 10/chromedriver')
            driver.get(test[k])
            try:
                # Older Guardian page layout.
                title = driver.find_element_by_tag_name('h1').text
                author_elements = driver.find_elements_by_class_name('tone-colour')
                author = ', '.join(el.text for el in author_elements)
                time = driver.find_element_by_class_name('content__dateline').text.split('\n')[0]
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                try:
                    content = soup.find(
                        'div', {'class': 'content__article-body from-content-api js-article__body'}).text
                    replace = soup.find(
                        'div', {'class': 'after-article js-after-article'}).next.next.text
                    for aside in soup.find_all('aside'):
                        content = content.replace(aside.text, '')
                    content = content.replace(soup.find('div', {'class': 'submeta'}).text, '')
                    content = content.replace(replace, '')
                    summary = soup.find('div', {'class': 'content__standfirst'}).text
                    if len(summary) > 300:
                        summary = summary[:300]
                except Exception:
                    summary = soup.find('div', {'class': 'content__standfirst'}).text
                    content = summary
                    if len(summary) > 300:
                        summary = summary[:300]
                headline.append(title)
                AUTHORS.append(author)
                timestamp.append(time)
                SUMMARY.append(summary)
                full.append(content)
                date_crawled.append(str(datetime.datetime.today().date()))
                news_source.append('https://www.theguardian.com/')
                driver.close()
            except Exception:
                try:
                    # Newer Guardian page layout (hashed CSS class names).
                    title = driver.find_element_by_tag_name('h1').text
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    author_elements = soup.find_all('address', {'aria-label': 'Contributor info'})
                    author = ', '.join(el.text for el in author_elements)
                    time = soup.find('div', {'class': 'css-1kkxezg'}).text.replace(
                        soup.find('span', {'class': 'css-nyo8hb'}).text, '')
                    try:
                        content = soup.find(
                            'div', {'class': 'article-body-commercial-selector css-79elbk'}).text
                        replace = soup.find('section', {'class': 'css-q5digb'}).text
                        for block in soup.find_all('div', {'class': 'css-wz7t6r'}):
                            content = content.replace(block.text, '')
                        content = content.replace(soup.find('div', {'class': 'css-739uag'}).text, '')
                        content = content.replace(replace, '')
                        summary = soup.find('div', {'class': 'css-12nmdsr'}).text
                        if len(summary) > 300:
                            summary = summary[:300]
                    except Exception:
                        summary = soup.find('div', {'class': 'content__standfirstcss-12nmdsr'}).text
                        content = summary
                        if len(summary) > 300:
                            summary = summary[:300]
                    if title not in headline:
                        headline.append(title)
                        AUTHORS.append(author)
                        timestamp.append(time)
                        SUMMARY.append(summary)
                        full.append(content)
                        date_crawled.append(str(datetime.datetime.today().date()))
                        news_source.append('https://www.theguardian.com/')
                    driver.close()
                except Exception:
                    print('Passed: ', test[k])
                    driver.close()
        except Exception as err:
            print(err)
            try:
                driver.close()
            except Exception:
                pass

    final = pd.DataFrame({
        'Title': headline,
        'Author': AUTHORS,
        'Summary': SUMMARY,
        'full_text': full,
        'date_published': timestamp,
        'date_crawled': date_crawled,
        'news_source': news_source
    })

    # Send each row unless search() says it is already indexed.
    for i in final.index:
        try:
            t = pd.DataFrame()
            t = t.append(final.loc[i])
            t.reset_index(drop=True, inplace=True)
            try:
                count = search(t.loc[0]['Title'])
                print(count)
                if count < 25:
                    test = t.loc[0].to_json()
                    send_data(test)
                    print('Data sent')
                else:
                    print('Skipped')
            except Exception:
                test = t.loc[0].to_json()
                send_data(test)
                print('Data sent')
        except Exception as e:
            print(e)
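# grab() expects a list of Guardian article URLs in `test`. A minimal sketch of how
# it might be fed, assuming article links follow the usual /<section>/<year>/<mon>/<day>/
# pattern (the front-page URL and the pattern are assumptions; requests and
# BeautifulSoup are already imported above):

import re

def collect_guardian_links():
    page = requests.get('https://www.theguardian.com/international', timeout=10)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('https://www.theguardian.com/') and re.search(r'/20\d{2}/[a-z]{3}/\d{2}/', href):
            if href not in links:
                links.append(href)
    return links

# Example: grab(collect_guardian_links(), 0)  -- the second argument is re-assigned inside grab().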
def axir_data():
    """Scrape yesterday's arXiv submissions for every category listed in categories.txt."""
    with open('categories.txt', 'r') as f:
        lines = f.read().split('\n')
    # Each line is tab-separated; keep only the non-empty fields.
    categories = [[field for field in line.split('\t') if field != ''] for line in lines]

    output = []
    for cat in categories:
        if len(cat) < 2:
            continue  # skip blank or malformed lines
        print(cat[1])
        scraper = arxivscraper.Scraper(
            category=cat[1],
            date_from=str((datetime.datetime.now() - datetime.timedelta(1)).date()),
            date_until=str(datetime.datetime.now().date()))
        output.append(scraper.scrape())

    cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
    df = pd.DataFrame([], columns=cols)
    for records in output:
        try:
            df = df.append(pd.DataFrame(records, columns=cols))
        except Exception:
            pass
    df.reset_index(drop=True, inplace=True)

    df = df.rename(columns={'abstract': 'Abstract', 'created': 'Date', 'title': 'Title'})
    df['Types'] = 'academic'
    df['Site'] = 'arxiv'
    df['Source'] = None

    # Flatten each author list into one comma-separated string.
    for i in range(len(df.authors)):
        df.authors[i] = ', '.join(df.authors[i])
    df = df.rename(columns={'authors': 'Authors'})

    df['Url'] = ['https://arxiv.org/abs/' + paper_id for paper_id in df.id]
    for i in range(len(df.doi)):
        df.doi[i] = 'http://doi.org/' + df.doi[i]
    df = df.rename(columns={'doi': 'Ref'})
    df['Pdf_url'] = ['https://arxiv.org/pdf/' + paper_id for paper_id in df.id]

    df = df.where(pd.notnull(df), np.nan)

    # Send each record unless search() says it is already indexed.
    for i in df.index:
        try:
            t = pd.DataFrame()
            t = t.append(df.loc[i])
            t.reset_index(drop=True, inplace=True)
            try:
                count = search(t.loc[0]['Title'], t.loc[0]['Site'])
                print(count)
                if count < 25:
                    test = t.loc[0].to_json()
                    send_data(test, t.loc[0]['Site'])
                    print('Data sent')
                else:
                    print('Skipped')
            except Exception:
                test = t.loc[0].to_json()
                send_data(test, t.loc[0]['Site'])
        except Exception as e:
            print(e)
    print('info fetched')
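# axir_data() expects categories.txt to hold one tab-separated entry per line with
# the arXiv/OAI category code in the second column (only that column is read).
# A small sketch that writes an illustrative file; the labels and the chosen
# category codes are examples, not the project's actual list:

def write_sample_categories(path='categories.txt'):
    sample = [
        ('Computer Science', 'cs'),
        ('Mathematics', 'math'),
        ('Statistics', 'stat'),
        ('Quantitative Biology', 'q-bio'),
    ]
    with open(path, 'w') as f:
        for label, code in sample:
            f.write(label + '\t' + code + '\n')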