Example #1
0
def get(company):
    """Fetch the Glassdoor search page for ``company`` and return it parsed.

    The page is requested from ``GLASSDOOR_API`` with the company name sent
    as the ``sc.keyword`` query parameter.  The query string is built by
    ``requests`` (via ``params=``) so that reserved characters in the name
    such as ``&``, ``=``, ``#`` or ``+`` are percent-encoded instead of
    being interpreted as query-string syntax.

    Returns the ``BeautifulSoup`` document with two callables attached:

    * ``soup.json()`` -- calls ``parse(soup, raw=True)``
    * ``soup.data()`` -- passes the result of ``soup.json()`` to
      ``json.loads``
    """
    # A list of pairs (rather than a dict) keeps the parameter order stable.
    query = [
        ('clickSource', 'searchBtn'),
        ('typedKeyword', ''),
        ('sc.keyword', company),
    ]
    r = requests.get(GLASSDOOR_API, params=query)
    soup = BeautifulSoup(r.content)
    soup.json = partial(parse, soup, raw=True)
    soup.data = lambda: json.loads(soup.json())
    return soup
Example #2
0
def get(company):
    """Fetch the Glassdoor page for ``company`` and return it parsed.

    The search page is requested from ``GLASSDOOR_API`` with the company
    name sent as the ``sc.keyword`` query parameter.  The query string is
    built by ``requests`` (via ``params=``) so that reserved characters in
    the name such as ``&``, ``=``, ``#`` or ``+`` are percent-encoded
    instead of being interpreted as query-string syntax.

    If the returned page contains a ``div`` with class ``sortBar``, the link
    produced by ``parse_exactMatch`` is fetched relative to
    ``GLASSDOOR_ROOT`` and that second page is used instead.

    Returns the ``BeautifulSoup`` document with two callables attached:

    * ``soup.json()`` -- calls ``parse(soup, raw=True)``
    * ``soup.data()`` -- passes the result of ``soup.json()`` to
      ``json.loads``
    """
    # A list of pairs (rather than a dict) keeps the parameter order stable.
    query = [
        ('clickSource', 'searchBtn'),
        ('typedKeyword', ''),
        ('sc.keyword', company),
    ]
    r = requests.get(GLASSDOOR_API, params=query)
    soup = BeautifulSoup(r.content)
    # NOTE(review): a 'sortBar' div presumably marks a result listing rather
    # than a company page -- confirm against the site markup.
    if soup.findAll('div', {'class': 'sortBar'}):
        link = parse_exactMatch(soup)
        r = requests.get('%s%s' % (GLASSDOOR_ROOT, link))
        soup = BeautifulSoup(r.content)
    soup.json = partial(parse, soup, raw=True)
    soup.data = lambda: json.loads(soup.json())
    return soup
Example #3
0
def metaFromDoi(doi):
    """Collect article metadata for a DOI or PubMed id.

    Steps:

    1. Search PubMed (``Entrez.esearch``) for ``str(doi)``, take the first
       ``<Id>`` of the response and fetch that record (``Entrez.efetch``).
    2. Copy volume, issue, DOI (``elocationid``), title, abstract, journal
       details, publication date and authors from the fetched record.
    3. Look for a matching ``Docs`` row: by ``pubmed_id`` when ``doi``
       parses as an integer, otherwise by ``doi``; if nothing matches, by
       the fetched PubMed id and finally by the fetched DOI.
    4. When a row is found, fill the values PubMed did not provide from the
       row, its journal and its ``JournalArticles`` entry.  Volume, issue,
       pagination and first/last page are always taken from the database.
    5. Ask the ``CHEMBLLIKE`` endpoint for a prediction; this step is
       best-effort and never raises.

    :param doi: a DOI string, or a PubMed id as an int or numeric string.
    :returns: a dict with keys ``journal``, ``authors``, ``pubmed``,
        ``doi``, ``title``, ``abstract``, ``doc_id`` (``None`` when no
        database row matched) and ``chembl_like`` (``"Yes"``/``"No"``),
        plus ``first_page``/``last_page`` when a row matched.
    :raises AttributeError: when the PubMed search response has no ``<Id>``.
    :raises ValueError: when the fetched PubMed id is not numeric and the
        fallback database lookup is attempted.
    """
    import logging

    from Bio import Entrez
    from BeautifulSoup import BeautifulSoup
    from chembl_business_model.models import JournalArticles, Docs

    def text_of(tag):
        # Attribute lookup on the soup gives None for a missing element.
        return tag.getText() if tag else ''

    def read_soup(handle):
        # Release the Entrez handle even when reading or parsing fails.
        try:
            return BeautifulSoup(handle.read())
        finally:
            handle.close()

    doc_id = None

    meta = {'journal': {'pubDate': {}}, 'authors': []}
    journal_meta = meta['journal']
    pub_date = journal_meta['pubDate']

    Entrez.email = settings.ADMINS[0][1]
    record = read_soup(Entrez.esearch(db="pubmed", term=str(doi)))
    # NOTE(review): record.id is None when the search has no hits, so this
    # line raises AttributeError -- confirm whether callers rely on that.
    pubmed_id = str(record.id.getText())
    result = read_soup(
        Entrez.efetch(db="pubmed", id=pubmed_id, rettype="gb"))

    journal_meta['volume'] = text_of(result.volume)
    journal_meta['issue'] = text_of(result.issue)
    meta['pubmed'] = pubmed_id
    meta['doi'] = text_of(result.elocationid)
    meta['title'] = text_of(result.articletitle)
    meta['abstract'] = text_of(result.abstracttext)

    journal = result.journal
    if journal:
        journal_meta['issn'] = text_of(journal.issn)
        journal_meta['title'] = text_of(journal.title)
        journal_meta['ISOAbbreviation'] = text_of(journal.isoabbreviation)
        pubdate = journal.pubdate
        if pubdate:
            pub_date['year'] = text_of(pubdate.year)
            pub_date['month'] = text_of(pubdate.month)
            pub_date['day'] = text_of(pubdate.day)

    if result.authorlist:
        for child in result.authorlist.childGenerator():
            if child and str(child).strip():
                author = BeautifulSoup(str(child))
                # Entries without a forename are skipped; a missing
                # lastname or initials no longer aborts the whole lookup.
                if author.forename:
                    meta['authors'].append({
                        'forename': author.forename.getText(),
                        'lastname': text_of(author.lastname),
                        'initials': text_of(author.initials),
                    })

    # A purely numeric argument is treated as a PubMed id, anything else as
    # a DOI.  Single-argument print(...) behaves the same on Python 2 and 3.
    try:
        pubmedId = int(doi)
    except ValueError:
        print('searching doc of doi = %s' % doi)
        q = Docs.objects.filter(doi__exact=doi)
    else:
        print('searching doc of pubmed_id = %s' % pubmedId)
        q = Docs.objects.filter(pubmed_id=pubmedId)

    doc = None
    if len(q):
        doc = q[0]
    else:
        print('searchuin')
        q = Docs.objects.filter(pubmed_id=int(pubmed_id))
        if len(q):
            doc = q[0]
        elif meta.get('doi'):
            q = Docs.objects.filter(doi__exact=meta['doi'])
            if len(q):
                doc = q[0]

    # Test the row itself rather than its key so a primary key of 0 still
    # counts as a match.
    if doc is not None:
        doc_id = doc.pk
        journal = doc.journal
        arts = JournalArticles.objects.filter(pk=doc_id)
        art = arts[0] if len(arts) else None

        # .get() is required: the journal / pubDate keys only exist when the
        # PubMed record contained the corresponding elements.
        if not journal_meta.get('title'):
            journal_meta['title'] = journal.title if journal else None
        if not journal_meta.get('ISOAbbreviation'):
            journal_meta['ISOAbbreviation'] = (
                journal.iso_abbreviation if journal else None)
        if not journal_meta.get('issn'):
            journal_meta['issn'] = journal.issn_print if journal else None
        # Still empty after the print ISSN: fall back to the electronic one.
        if not journal_meta.get('issn'):
            journal_meta['issn'] = (
                journal.issn_electronic if journal else None)
        journal_meta['volume'] = doc.volume
        journal_meta['issue'] = doc.issue
        if not pub_date.get('year'):
            pub_date['year'] = art.year if art else None
        if not pub_date.get('month'):
            pub_date['month'] = art.month if art else None
        if not pub_date.get('day'):
            pub_date['day'] = art.day if art else None
        journal_meta['pagination'] = art.pagination if art else None
        meta['first_page'] = doc.first_page
        meta['last_page'] = doc.last_page
        if not meta['title']:
            meta['title'] = doc.title
        if not meta['abstract']:
            meta['abstract'] = doc.abstract
        if not meta['authors']:
            meta['authors'] = doc.authors

    meta['doc_id'] = doc_id

    meta['chembl_like'] = "No"

    title = urlquote(meta['title'])
    abstract = urlquote(meta['abstract'])
    url = '%sCHEMBLLIKE/%s/%s' % (
        settings.PIPLINE_PILOT_ENDPOINT, title, abstract)
    try:
        response = requests.get(url, timeout=60)
        if response.status_code == 200 and response.json()["Prediction"]:
            meta['chembl_like'] = "Yes"
    except Exception:
        # Best-effort: any failure leaves chembl_like at "No", but it is
        # logged rather than silently discarded.
        logging.getLogger(__name__).warning(
            'CHEMBLLIKE prediction request failed', exc_info=True)

    return meta
Example #4
0
def metaFromDoi(doi):
    """Collect article metadata for a DOI or PubMed id.

    Steps:

    1. Search PubMed (``Entrez.esearch``) for ``str(doi)``, take the first
       ``<Id>`` of the response and fetch that record (``Entrez.efetch``).
    2. Copy volume, issue, DOI (``elocationid``), title, abstract, journal
       details, publication date and authors from the fetched record.
    3. Look for a matching ``Docs`` row: by ``pubmed_id`` when ``doi``
       parses as an integer, otherwise by ``doi``; if nothing matches, by
       the fetched PubMed id and finally by the fetched DOI.
    4. When a row is found, fill the values PubMed did not provide from the
       row, its journal and its ``JournalArticles`` entry.  Volume, issue,
       pagination and first/last page are always taken from the database.
    5. Ask the ``CHEMBLLIKE`` endpoint for a prediction; this step is
       best-effort and never raises.

    :param doi: a DOI string, or a PubMed id as an int or numeric string.
    :returns: a dict with keys ``journal``, ``authors``, ``pubmed``,
        ``doi``, ``title``, ``abstract``, ``doc_id`` (``None`` when no
        database row matched) and ``chembl_like`` (``"Yes"``/``"No"``),
        plus ``first_page``/``last_page`` when a row matched.
    :raises AttributeError: when the PubMed search response has no ``<Id>``.
    :raises ValueError: when the fetched PubMed id is not numeric and the
        fallback database lookup is attempted.
    """
    import logging

    from Bio import Entrez
    from BeautifulSoup import BeautifulSoup
    from chembl_business_model.models import JournalArticles, Docs

    def text_of(tag):
        # Attribute lookup on the soup gives None for a missing element.
        return tag.getText() if tag else ''

    def read_soup(handle):
        # Release the Entrez handle even when reading or parsing fails.
        try:
            return BeautifulSoup(handle.read())
        finally:
            handle.close()

    doc_id = None

    meta = {'journal': {'pubDate': {}}, 'authors': []}
    journal_meta = meta['journal']
    pub_date = journal_meta['pubDate']

    Entrez.email = settings.ADMINS[0][1]
    record = read_soup(Entrez.esearch(db="pubmed", term=str(doi)))
    # NOTE(review): record.id is None when the search has no hits, so this
    # line raises AttributeError -- confirm whether callers rely on that.
    pubmed_id = str(record.id.getText())
    result = read_soup(
        Entrez.efetch(db="pubmed", id=pubmed_id, rettype="gb"))

    journal_meta['volume'] = text_of(result.volume)
    journal_meta['issue'] = text_of(result.issue)
    meta['pubmed'] = pubmed_id
    meta['doi'] = text_of(result.elocationid)
    meta['title'] = text_of(result.articletitle)
    meta['abstract'] = text_of(result.abstracttext)

    journal = result.journal
    if journal:
        journal_meta['issn'] = text_of(journal.issn)
        journal_meta['title'] = text_of(journal.title)
        journal_meta['ISOAbbreviation'] = text_of(journal.isoabbreviation)
        pubdate = journal.pubdate
        if pubdate:
            pub_date['year'] = text_of(pubdate.year)
            pub_date['month'] = text_of(pubdate.month)
            pub_date['day'] = text_of(pubdate.day)

    if result.authorlist:
        for child in result.authorlist.childGenerator():
            if child and str(child).strip():
                author = BeautifulSoup(str(child))
                # Entries without a forename are skipped; a missing
                # lastname or initials no longer aborts the whole lookup.
                if author.forename:
                    meta['authors'].append({
                        'forename': author.forename.getText(),
                        'lastname': text_of(author.lastname),
                        'initials': text_of(author.initials),
                    })

    # A purely numeric argument is treated as a PubMed id, anything else as
    # a DOI.  Single-argument print(...) behaves the same on Python 2 and 3.
    try:
        pubmedId = int(doi)
    except ValueError:
        print('searching doc of doi = %s' % doi)
        q = Docs.objects.filter(doi__exact=doi)
    else:
        print('searching doc of pubmed_id = %s' % pubmedId)
        q = Docs.objects.filter(pubmed_id=pubmedId)

    doc = None
    if len(q):
        doc = q[0]
    else:
        print('searchuin')
        q = Docs.objects.filter(pubmed_id=int(pubmed_id))
        if len(q):
            doc = q[0]
        elif meta.get('doi'):
            q = Docs.objects.filter(doi__exact=meta['doi'])
            if len(q):
                doc = q[0]

    # Test the row itself rather than its key so a primary key of 0 still
    # counts as a match.
    if doc is not None:
        doc_id = doc.pk
        journal = doc.journal
        arts = JournalArticles.objects.filter(pk=doc_id)
        art = arts[0] if len(arts) else None

        # .get() is required: the journal / pubDate keys only exist when the
        # PubMed record contained the corresponding elements.
        if not journal_meta.get('title'):
            journal_meta['title'] = journal.title if journal else None
        if not journal_meta.get('ISOAbbreviation'):
            journal_meta['ISOAbbreviation'] = (
                journal.iso_abbreviation if journal else None)
        if not journal_meta.get('issn'):
            journal_meta['issn'] = journal.issn_print if journal else None
        # Still empty after the print ISSN: fall back to the electronic one.
        if not journal_meta.get('issn'):
            journal_meta['issn'] = (
                journal.issn_electronic if journal else None)
        journal_meta['volume'] = doc.volume
        journal_meta['issue'] = doc.issue
        if not pub_date.get('year'):
            pub_date['year'] = art.year if art else None
        if not pub_date.get('month'):
            pub_date['month'] = art.month if art else None
        if not pub_date.get('day'):
            pub_date['day'] = art.day if art else None
        journal_meta['pagination'] = art.pagination if art else None
        meta['first_page'] = doc.first_page
        meta['last_page'] = doc.last_page
        if not meta['title']:
            meta['title'] = doc.title
        if not meta['abstract']:
            meta['abstract'] = doc.abstract
        if not meta['authors']:
            meta['authors'] = doc.authors

    meta['doc_id'] = doc_id

    meta['chembl_like'] = "No"

    title = urlquote(meta['title'])
    abstract = urlquote(meta['abstract'])
    url = '%sCHEMBLLIKE/%s/%s' % (
        settings.PIPLINE_PILOT_ENDPOINT, title, abstract)
    try:
        response = requests.get(url, timeout=60)
        if response.status_code == 200 and response.json()["Prediction"]:
            meta['chembl_like'] = "Yes"
    except Exception:
        # Best-effort: any failure leaves chembl_like at "No", but it is
        # logged rather than silently discarded.
        logging.getLogger(__name__).warning(
            'CHEMBLLIKE prediction request failed', exc_info=True)

    return meta