def get(company):
    """Perform an HTTP GET for a Glassdoor search page and return it parsed.

    Args:
        company: Search keyword sent as the ``sc.keyword`` query parameter.

    Returns:
        A BeautifulSoup object for the response body, with two callables
        attached: ``json()`` which calls ``parse(soup, raw=True)``, and
        ``data()`` which decodes the output of ``json()`` with ``json.loads``.
    """
    # Hand the query to requests as key/value pairs so the company name is
    # percent-encoded.  Interpolating it into the URL by hand let names that
    # contain '&', '#', '=' or spaces (e.g. "AT&T") corrupt the query string.
    # A list of pairs keeps the parameter order of the original URL.
    query = [
        ('clickSource', 'searchBtn'),
        ('typedKeyword', ''),
        ('sc.keyword', company),
    ]
    r = requests.get(GLASSDOOR_API, params=query)
    soup = BeautifulSoup(r.content)
    soup.json = partial(parse, soup, raw=True)
    soup.data = lambda: json.loads(soup.json())
    return soup
def get(company):
    """Perform an HTTP GET for a Glassdoor page and return it parsed.

    The search page is requested first.  When that page contains a
    ``<div class="sortBar">``, the link returned by ``parse_exactMatch`` is
    fetched relative to ``GLASSDOOR_ROOT`` and that second page is used
    instead.
    NOTE(review): presumably ``sortBar`` marks a multi-result listing rather
    than a single company page -- confirm against the live markup.

    Args:
        company: Search keyword sent as the ``sc.keyword`` query parameter.

    Returns:
        A BeautifulSoup object for the final response body, with two
        callables attached: ``json()`` which calls ``parse(soup, raw=True)``,
        and ``data()`` which decodes the output of ``json()`` with
        ``json.loads``.
    """
    # Hand the query to requests as key/value pairs so the company name is
    # percent-encoded.  Interpolating it into the URL by hand let names that
    # contain '&', '#', '=' or spaces (e.g. "AT&T") corrupt the query string.
    # A list of pairs keeps the parameter order of the original URL.
    query = [
        ('clickSource', 'searchBtn'),
        ('typedKeyword', ''),
        ('sc.keyword', company),
    ]
    r = requests.get(GLASSDOOR_API, params=query)
    soup = BeautifulSoup(r.content)

    if soup.findAll('div', {'class': 'sortBar'}):
        link = parse_exactMatch(soup)
        r = requests.get('%s%s' % (GLASSDOOR_ROOT, link))
        soup = BeautifulSoup(r.content)

    soup.json = partial(parse, soup, raw=True)
    soup.data = lambda: json.loads(soup.json())
    return soup
def metaFromDoi(doi):
    """Collect article metadata for a DOI or PubMed id.

    Steps, in order:
      1. Look the term up in PubMed (Entrez ``esearch`` then ``efetch``) and
         read volume, issue, title, abstract, journal, date and authors.
      2. Find a matching ``Docs`` row: by ``doi`` as a PubMed id when it
         parses as an integer, otherwise by DOI; failing that, by the
         PubMed id returned from step 1, then by the DOI found in step 1.
      3. When a row is found, fill the still-empty fields from it; volume,
         issue, pagination and first/last page are always taken from it.
      4. Ask the ``CHEMBLLIKE`` endpoint for a prediction (best effort).

    Args:
        doi: A DOI, or a PubMed id (anything ``int()`` accepts).

    Returns:
        dict with keys ``journal`` (``volume``, ``issue``, ``issn``,
        ``title``, ``ISOAbbreviation``, ``pubDate`` with ``year``/``month``/
        ``day``, plus ``pagination`` when a ``Docs`` row matched),
        ``authors``, ``pubmed``, ``doi``, ``title``, ``abstract``,
        ``doc_id`` (``None`` when no row matched), ``chembl_like``
        (``"Yes"``/``"No"``), and ``first_page``/``last_page`` when a row
        matched.

    Raises:
        AttributeError: when the esearch response has no ``id`` element
            (presumably PubMed found nothing for the term -- confirm).
    """
    from Bio import Entrez
    from BeautifulSoup import BeautifulSoup
    from chembl_business_model.models import JournalArticles, Docs

    def text_of(node):
        # Text of an element, or '' when the lookup found no such element.
        return node.getText() if node else ''

    def read_soup(handle):
        # Parse an Entrez response and always release the connection.
        try:
            return BeautifulSoup(handle.read())
        finally:
            handle.close()

    doc_id = None
    # Seed every key that is read further down so that a PubMed record
    # without <Journal>/<PubDate> cannot cause a KeyError later.
    meta = {
        'journal': {
            'volume': '',
            'issue': '',
            'issn': '',
            'title': '',
            'ISOAbbreviation': '',
            'pubDate': {'year': '', 'month': '', 'day': ''},
        },
        'authors': [],
    }
    journal_meta = meta['journal']
    pub_date = journal_meta['pubDate']

    Entrez.email = settings.ADMINS[0][1]
    record = read_soup(Entrez.esearch(db="pubmed", term=str(doi)))
    pubmed_id = str(record.id.getText())
    result = read_soup(Entrez.efetch(db="pubmed", id=pubmed_id, rettype="gb"))

    journal_meta['volume'] = text_of(result.volume)
    journal_meta['issue'] = text_of(result.issue)
    meta['pubmed'] = pubmed_id
    # NOTE(review): this takes the first <ELocationID>; it may not be a DOI
    # if the record lists another identifier type first -- confirm.
    meta['doi'] = text_of(result.elocationid)
    meta['title'] = text_of(result.articletitle)
    meta['abstract'] = text_of(result.abstracttext)

    journal = result.journal
    if journal:
        journal_meta['issn'] = text_of(journal.issn)
        journal_meta['title'] = text_of(journal.title)
        journal_meta['ISOAbbreviation'] = text_of(journal.isoabbreviation)
        pubdate = journal.pubdate
        if pubdate:
            pub_date['year'] = text_of(pubdate.year)
            pub_date['month'] = text_of(pubdate.month)
            pub_date['day'] = text_of(pubdate.day)

    if result.authorlist:
        for child in result.authorlist.childGenerator():
            # Skip whitespace-only nodes between the author elements.
            if child and str(child).strip():
                author = BeautifulSoup(str(child))
                auth = {}
                if author.forename:
                    auth['forename'] = author.forename.getText()
                # Guarded like every other field: an author entry without
                # these elements used to abort the whole lookup.
                auth['lastname'] = text_of(author.lastname)
                auth['initials'] = text_of(author.initials)
                meta['authors'].append(auth)

    # Decide whether the caller passed a PubMed id or a DOI.
    try:
        pubmedId = int(doi)
    except ValueError:
        print('searching doc of doi = %s' % doi)
        q = Docs.objects.filter(doi__exact=doi)
    else:
        print('searching doc of pubmed_id = %s' % pubmedId)
        q = Docs.objects.filter(pubmed_id=pubmedId)

    if len(q):
        doc_id = q[0].pk
    else:
        # Fall back to the identifiers PubMed returned.
        print('searchuin')
        q = Docs.objects.filter(pubmed_id=int(pubmed_id))
        if len(q):
            doc_id = q[0].pk
        elif meta.get('doi'):
            q = Docs.objects.filter(doi__exact=meta['doi'])
            if len(q):
                doc_id = q[0].pk

    if doc_id:
        doc = q[0]
        journal = doc.journal
        arts = JournalArticles.objects.filter(pk=doc_id)
        art = arts[0] if len(arts) else None

        if not journal_meta['title']:
            journal_meta['title'] = journal.title if journal else None
        if not journal_meta['ISOAbbreviation']:
            journal_meta['ISOAbbreviation'] = (
                journal.iso_abbreviation if journal else None)
        # Prefer the print ISSN, then the electronic one.
        if not journal_meta['issn']:
            journal_meta['issn'] = journal.issn_print if journal else None
        if not journal_meta['issn']:
            journal_meta['issn'] = journal.issn_electronic if journal else None

        # Volume and issue from the local row replace the PubMed values.
        journal_meta['volume'] = doc.volume
        journal_meta['issue'] = doc.issue

        if not pub_date['year']:
            pub_date['year'] = art.year if art else None
        if not pub_date['month']:
            pub_date['month'] = art.month if art else None
        if not pub_date['day']:
            pub_date['day'] = art.day if art else None
        journal_meta['pagination'] = art.pagination if art else None

        meta['first_page'] = doc.first_page
        meta['last_page'] = doc.last_page
        if not meta['title']:
            meta['title'] = doc.title
        if not meta['abstract']:
            meta['abstract'] = doc.abstract
        if not meta['authors']:
            meta['authors'] = doc.authors

    meta['doc_id'] = doc_id

    meta['chembl_like'] = "No"
    title = urlquote(meta['title'])
    abstract = urlquote(meta['abstract'])
    url = '%sCHEMBLLIKE/%s/%s' % (settings.PIPLINE_PILOT_ENDPOINT, title,
                                  abstract)
    try:
        response = requests.get(url, timeout=60)
        if response.status_code == 200 and response.json()["Prediction"]:
            meta['chembl_like'] = "Yes"
    except Exception:
        # Best effort: a failed or malformed prediction leaves "No" in place.
        # Narrowed from a bare except so Ctrl-C and SystemExit still propagate.
        pass
    return meta
def metaFromDoi(doi):
    """Collect article metadata for a DOI or PubMed id.

    Steps, in order:
      1. Look the term up in PubMed (Entrez ``esearch`` then ``efetch``) and
         read volume, issue, title, abstract, journal, date and authors.
      2. Find a matching ``Docs`` row: by ``doi`` as a PubMed id when it
         parses as an integer, otherwise by DOI; failing that, by the
         PubMed id returned from step 1, then by the DOI found in step 1.
      3. When a row is found, fill the still-empty fields from it; volume,
         issue, pagination and first/last page are always taken from it.
      4. Ask the ``CHEMBLLIKE`` endpoint for a prediction (best effort).

    Args:
        doi: A DOI, or a PubMed id (anything ``int()`` accepts).

    Returns:
        dict with keys ``journal`` (``volume``, ``issue``, ``issn``,
        ``title``, ``ISOAbbreviation``, ``pubDate`` with ``year``/``month``/
        ``day``, plus ``pagination`` when a ``Docs`` row matched),
        ``authors``, ``pubmed``, ``doi``, ``title``, ``abstract``,
        ``doc_id`` (``None`` when no row matched), ``chembl_like``
        (``"Yes"``/``"No"``), and ``first_page``/``last_page`` when a row
        matched.

    Raises:
        AttributeError: when the esearch response has no ``id`` element
            (presumably PubMed found nothing for the term -- confirm).
    """
    from Bio import Entrez
    from BeautifulSoup import BeautifulSoup
    from chembl_business_model.models import JournalArticles, Docs

    def element_text(element):
        # Text of an element, or '' when the lookup found no such element.
        return element.getText() if element else ''

    def parse_and_close(handle):
        # Parse an Entrez response and always release the connection.
        try:
            return BeautifulSoup(handle.read())
        finally:
            handle.close()

    doc_id = None
    # Seed every key that is read further down so that a PubMed record
    # without <Journal>/<PubDate> cannot cause a KeyError later.
    meta = {
        'journal': {
            'volume': '',
            'issue': '',
            'issn': '',
            'title': '',
            'ISOAbbreviation': '',
            'pubDate': {'year': '', 'month': '', 'day': ''},
        },
        'authors': [],
    }
    journal_info = meta['journal']
    date_info = journal_info['pubDate']

    Entrez.email = settings.ADMINS[0][1]
    record = parse_and_close(Entrez.esearch(db="pubmed", term=str(doi)))
    pubmed_id = str(record.id.getText())
    result = parse_and_close(
        Entrez.efetch(db="pubmed", id=pubmed_id, rettype="gb"))

    journal_info['volume'] = element_text(result.volume)
    journal_info['issue'] = element_text(result.issue)
    meta['pubmed'] = pubmed_id
    # NOTE(review): this takes the first <ELocationID>; it may not be a DOI
    # if the record lists another identifier type first -- confirm.
    meta['doi'] = element_text(result.elocationid)
    meta['title'] = element_text(result.articletitle)
    meta['abstract'] = element_text(result.abstracttext)

    journal = result.journal
    if journal:
        journal_info['issn'] = element_text(journal.issn)
        journal_info['title'] = element_text(journal.title)
        journal_info['ISOAbbreviation'] = element_text(journal.isoabbreviation)
        pubdate = journal.pubdate
        if pubdate:
            date_info['year'] = element_text(pubdate.year)
            date_info['month'] = element_text(pubdate.month)
            date_info['day'] = element_text(pubdate.day)

    if result.authorlist:
        for node in result.authorlist.childGenerator():
            # Skip whitespace-only nodes between the author elements.
            if node and str(node).strip():
                author = BeautifulSoup(str(node))
                auth = {}
                if author.forename:
                    auth['forename'] = author.forename.getText()
                # Guarded like every other field: an author entry without
                # these elements used to abort the whole lookup.
                auth['lastname'] = element_text(author.lastname)
                auth['initials'] = element_text(author.initials)
                meta['authors'].append(auth)

    # Decide whether the caller passed a PubMed id or a DOI.
    try:
        pubmedId = int(doi)
    except ValueError:
        print('searching doc of doi = %s' % doi)
        q = Docs.objects.filter(doi__exact=doi)
    else:
        print('searching doc of pubmed_id = %s' % pubmedId)
        q = Docs.objects.filter(pubmed_id=pubmedId)

    if len(q):
        doc_id = q[0].pk
    else:
        # Fall back to the identifiers PubMed returned.
        print('searchuin')
        q = Docs.objects.filter(pubmed_id=int(pubmed_id))
        if len(q):
            doc_id = q[0].pk
        elif meta.get('doi'):
            q = Docs.objects.filter(doi__exact=meta['doi'])
            if len(q):
                doc_id = q[0].pk

    if doc_id:
        doc = q[0]
        journal = doc.journal
        arts = JournalArticles.objects.filter(pk=doc_id)
        art = arts[0] if len(arts) else None

        if not journal_info['title']:
            journal_info['title'] = journal.title if journal else None
        if not journal_info['ISOAbbreviation']:
            journal_info['ISOAbbreviation'] = (
                journal.iso_abbreviation if journal else None)
        # Prefer the print ISSN, then the electronic one.
        if not journal_info['issn']:
            journal_info['issn'] = journal.issn_print if journal else None
        if not journal_info['issn']:
            journal_info['issn'] = journal.issn_electronic if journal else None

        # Volume and issue from the local row replace the PubMed values.
        journal_info['volume'] = doc.volume
        journal_info['issue'] = doc.issue

        if not date_info['year']:
            date_info['year'] = art.year if art else None
        if not date_info['month']:
            date_info['month'] = art.month if art else None
        if not date_info['day']:
            date_info['day'] = art.day if art else None
        journal_info['pagination'] = art.pagination if art else None

        meta['first_page'] = doc.first_page
        meta['last_page'] = doc.last_page
        if not meta['title']:
            meta['title'] = doc.title
        if not meta['abstract']:
            meta['abstract'] = doc.abstract
        if not meta['authors']:
            meta['authors'] = doc.authors

    meta['doc_id'] = doc_id

    meta['chembl_like'] = "No"
    title = urlquote(meta['title'])
    abstract = urlquote(meta['abstract'])
    url = '%sCHEMBLLIKE/%s/%s' % (settings.PIPLINE_PILOT_ENDPOINT, title,
                                  abstract)
    try:
        response = requests.get(url, timeout=60)
        if response.status_code == 200 and response.json()["Prediction"]:
            meta['chembl_like'] = "Yes"
    except Exception:
        # Best effort: a failed or malformed prediction leaves "No" in place.
        # Narrowed from a bare except so Ctrl-C and SystemExit still propagate.
        pass
    return meta