Example #1
def pull_ecolog(unseen=True):
    # search the mailbox for messages tagged ECOLOG in the subject line
    subject_tag = "ECOLOG"
    msg_ids = query_msg_ids(subject_tag=subject_tag, unseen=unseen)

    subject_list = []
    body_list = []
    url_list = []

    # plain loop variable; the original `global id` leaked loop state and
    # shadowed the built-in `id`
    for msg_id in msg_ids:
        msg_raw = pull_msg(msg_id)
        msg_subject, msg_body, msg_url = pull_msg_content_ecolog(msg_raw)

        subject_list.append(msg_subject[0])
        body_list.append(msg_body[0])
        url_list.append(msg_url)

    res = pd.DataFrame({
        'subject': subject_list,
        'body': body_list,
        'url': url_list
    })
    res['source'] = '[ECOLOG]'
    res['subject'] = res['subject'] + ' | ' + \
        res['url'] + ' | ' + res['source']

    if len(res) > 0:
        res = utils.filter_limno(res)

    return res
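

# A minimal self-contained sketch of the subject/body/url accumulation
# pattern that every puller in this module follows. The `messages` list is a
# stand-in for what pull_msg_content_ecolog() returns, not real data.
def _ecolog_pattern_demo():
    import pandas as pd
    messages = [("Postdoc in limnology", "details...", "https://example.org/1"),
                ("Lake modeler wanted", "details...", "https://example.org/2")]

    subject_list, body_list, url_list = [], [], []
    for subject, body, url in messages:
        subject_list.append(subject)
        body_list.append(body)
        url_list.append(url)

    res = pd.DataFrame({'subject': subject_list,
                        'body': body_list,
                        'url': url_list})
    res['source'] = '[ECOLOG]'
    res['subject'] = res['subject'] + ' | ' + res['url'] + ' | ' + res['source']
    return res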


## agu (draft): same accumulation pattern as pull_ecolog, but querying by
## sender rather than by subject tag
# sender = "*****@*****.**"
# msg_ids = query_msg_ids(sender=sender, unseen=False)
# subject_list = []
# body_list = []
# url_list = []
# for msg_id in msg_ids:
#     msg_raw = pull_msg(msg_id)
#     msg_subject, msg_body, msg_url = pull_msg_content_agu(msg_raw)
#     subject_list.append(msg_subject[0])
#     body_list.append(msg_body[0])
#     url_list.append(msg_url)
Example #2
def pull_earthenvscience():
    # data rows begin at sheet row 8 (see the ranges below)
    sh = gc.open_by_url(
        "https://docs.google.com/spreadsheets/d/16Qcgpe3_zx3EOCXe5vElev22OhiGtlI2YkukLHfNWf0/edit#gid=1017187727"
    )

    dt_raw = sh.values_batch_get([
        "Faculty/Permanent Jobs!H8:H",  # rank
        "Faculty/Permanent Jobs!F8:F",  # subject
        "Faculty/Permanent Jobs!B8:B",  # institution
        "Faculty/Permanent Jobs!M8:M",  # notes
        "Faculty/Permanent Jobs!C8:C",  # closing_date
        "Faculty/Permanent Jobs!E8:E",  # url
    ])

    dt = pd.DataFrame({
        "rank": dt_raw["valueRanges"][0]["values"],
        "subject": dt_raw["valueRanges"][1]["values"],
        "institution": dt_raw["valueRanges"][2]["values"],
        # "notes": dt_raw['valueRanges'][3]['values'],
        "closing_date": dt_raw["valueRanges"][4]["values"],
        "url": dt_raw["valueRanges"][5]["values"],
    })
    # each Sheets cell arrives as a one-element list; flatten to plain strings
    dt = dt.applymap(lambda x: "".join(map(str, x)))
    dt["source"] = "[earthenvscience]"

    # drop empty or non-date closing_date cells, then drop closed postings
    dt = dt.loc[dt["closing_date"].str.len() > 0]
    dt = dt.loc[dt["closing_date"].str.len() < 15]  # strings this long are not parseable dates
    dt["closing_date"] = pd.to_datetime(dt["closing_date"])
    dt = dt.loc[dt["closing_date"] >= pd.to_datetime(datetime.date.today())]

    subject = (dt["rank"] + " | " + dt["subject"] + " | " + dt["institution"] +
               " | " + dt["url"] + " | " + dt["source"])
    body = ""

    res = pd.DataFrame({
        "subject": subject,
        "body": body,
        "source": dt["source"]
    })

    res = utils.filter_limno(res)

    return res
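

# The closing-date guard above is easy to check in isolation. A toy version
# of the same filter; the dates below are made up.
def _closing_date_filter_demo():
    import datetime
    import pandas as pd
    toy = pd.DataFrame({"closing_date":
                        ["01/15/2020", "", "12/31/2099", "see posting for details"]})
    toy = toy.loc[toy["closing_date"].str.len() > 0]   # drop empty cells
    toy = toy.loc[toy["closing_date"].str.len() < 15]  # drop non-date notes
    toy["closing_date"] = pd.to_datetime(toy["closing_date"])
    # only the 12/31/2099 row survives (assuming today is before 2099)
    return toy.loc[toy["closing_date"] >= pd.to_datetime(datetime.date.today())]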


# pull_ecophys
Example #3
def pull_pangeo():
    pangeo_baseurl = 'https://discourse.pangeo.io/c/news/jobs/14'

    response = requests.get(pangeo_baseurl)
    soup = BeautifulSoup(response.text, 'html.parser')
    individual_pages = soup.find_all(
        'a', attrs={'class': 'title raw-link raw-topic-link'})

    # read the link target and anchor text straight off the parsed tags
    # instead of regexing over their string form
    pangeo_urls = [x.get('href') for x in individual_pages]
    url_list = pangeo_urls[1:]  # exclude the about page

    pangeo_subjects = [x.get_text() for x in individual_pages]
    subject_list = pangeo_subjects[1:]

    body_list = []
    for url in url_list:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        msg_body = soup.find_all(['p', 'td'])
        if len(msg_body) == 0:
            msg_body = "error"  # sentinel for pages with no parsable body
        body_list.append(str(msg_body))

    res = pd.DataFrame({
        'subject': subject_list,
        'body': body_list,
        'url': url_list
    })
    res['source'] = '[PANGEO]'
    res['subject'] = res['subject'] + ' | ' + \
        res['url'] + ' | ' + res['source']

    if len(res) > 0:
        res = utils.filter_limno(res)

    return res
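

# The attribute access used above works on any parsed anchor; a
# self-contained demonstration on an inline snippet (the urls and titles
# here are invented).
def _anchor_parse_demo():
    from bs4 import BeautifulSoup
    html = ('<a class="title raw-link raw-topic-link" '
            'href="https://example.org/t/about">About the Jobs category</a>'
            '<a class="title raw-link raw-topic-link" '
            'href="https://example.org/t/2">Postdoc: lake remote sensing</a>')
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', attrs={'class': 'title raw-link raw-topic-link'})
    urls = [x.get('href') for x in links]      # ['https://example.org/t/about', ...]
    subjects = [x.get_text() for x in links]   # ['About the Jobs category', ...]
    return urls[1:], subjects[1:]              # drop the about entry, as above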
Example #4
def pull_rss():
    rawrss = pd.read_csv(
        pkg_resources.resource_filename('limnojobs', 'rss.csv'))
    # sort rawrss by increasing journal name nchar length for pretty printing
    rawrss.index = rawrss['title'].str.len()
    rawrss = rawrss.sort_index().reset_index(drop=True)

    # pull and filter each feed, collecting frames for a single concat
    # (DataFrame.append was removed in pandas 2.0)
    post_list = []
    for i in range(len(rawrss)):
        res_raw = pull_feed(rawrss['rawrss'][i], rawrss['title'][i])
        res = filter_limno(res_raw).reset_index()
        res['source'] = rawrss['title'][i]
        post_list.append(res)
    posts = pd.concat(post_list, ignore_index=True)

    posts['subject'] = posts['subject'] + ' | ' + \
        posts['url'] + ' | ' + posts['source']

    return posts
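

# pandas removed DataFrame.append in 2.0, so the loop above collects frames
# and concatenates once. The same pattern in isolation, with invented feeds:
def _concat_pattern_demo():
    import pandas as pd
    frames = []
    for title in ["Limnology", "Water Research"]:  # stand-ins for rss.csv titles
        res = pd.DataFrame({"subject": ["job posting"],
                            "url": ["https://example.org"]})
        res["source"] = title
        frames.append(res)
    return pd.concat(frames, ignore_index=True)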
Example #5
def pull_rse():
    rse_baseurl = 'https://us-rse.org/jobs/'

    response = requests.get(rse_baseurl)
    soup = BeautifulSoup(response.text, 'html.parser')
    individual_pages = soup.find_all('a', attrs={'target': '_blank'})

    # read the link target and anchor text straight off the parsed tags
    # instead of regexing over their string form
    url_list = [x.get('href') for x in individual_pages]
    subject_list = [x.get_text() for x in individual_pages]

    body_list = []
    for url in url_list:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        msg_body = soup.find_all(['p', 'td'])
        if len(msg_body) == 0:
            msg_body = "error"  # sentinel for pages with no parsable body
        body_list.append(str(msg_body))

    res = pd.DataFrame({
        'subject': subject_list,
        'body': body_list,
        'url': url_list
    })
    res['source'] = '[USRSE]'
    res['subject'] = res['subject'] + ' | ' + \
        res['url'] + ' | ' + res['source']

    if len(res) > 0:
        res = utils.filter_limno(res)

    return res
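

# The per-url fetch loop above has no timeout or error handling. A hedged
# variant that degrades to the same "error" sentinel on network failures;
# the 10-second timeout is an arbitrary choice, not part of the original.
def _fetch_body(url, timeout=10):
    import requests
    from bs4 import BeautifulSoup
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        return "error"
    msg_body = BeautifulSoup(response.text, 'html.parser').find_all(['p', 'td'])
    return str(msg_body) if msg_body else "error"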
Example #6
def pull_ecoevo():
    # data rows begin at sheet row 3 (see the ranges below)
    sh = gc.open_by_url(
        "https://docs.google.com/spreadsheets/d/1hf_q-3gdyOlsk97I3OW97w_cmQXsKQVC-ZGDMgFnL2I/edit#gid=1954069648"
    )

    dt_raw_faculty = sh.values_batch_get([
        "Faculty Jobs!G3:G",  # rank
        "Faculty Jobs!D3:D",  # subject
        "Faculty Jobs!B3:B",  # institution
        "Faculty Jobs!J3:J",  # notes
        "Faculty Jobs!E3:E",  # closing_date
        "Faculty Jobs!F3:F",  # url
    ])

    dt_faculty = pd.DataFrame({
        "rank": dt_raw_faculty["valueRanges"][0]["values"],
        "subject": dt_raw_faculty["valueRanges"][1]["values"],
        "institution": dt_raw_faculty["valueRanges"][2]["values"],
        "notes": dt_raw_faculty["valueRanges"][3]["values"],
        "closing_date": dt_raw_faculty["valueRanges"][4]["values"],
        "url": dt_raw_faculty["valueRanges"][5]["values"],
    })
    # flatten one-element Sheets cells to plain strings
    dt_faculty = dt_faculty.applymap(lambda x: "".join(map(str, x)))

    dt_raw_postdoc = sh.values_batch_get([
        "Postdoc Jobs!G3:G",  # rank (unused; overwritten below)
        "Postdoc Jobs!D3:D",  # subject
        "Postdoc Jobs!B3:B",  # institution
        "Postdoc Jobs!I3:I",  # notes (unused)
        "Postdoc Jobs!F3:F",  # closing_date
        "Postdoc Jobs!G3:G",  # url (column G is requested twice)
    ])

    dt_postdoc = pd.DataFrame({
        "rank": dt_raw_postdoc["valueRanges"][0]["values"],
        "subject": dt_raw_postdoc["valueRanges"][1]["values"],
        "institution": dt_raw_postdoc["valueRanges"][2]["values"],
        # "notes": dt_raw_postdoc['valueRanges'][3]['values'],
        "closing_date": dt_raw_postdoc["valueRanges"][4]["values"],
        "url": dt_raw_postdoc["valueRanges"][5]["values"],
    })
    dt_postdoc = dt_postdoc.applymap(lambda x: "".join(map(str, x)))
    dt_postdoc["rank"] = "Postdoc"
    # temporarily account for bad closing_date field
    # TODO: set nondate objects in the closing_date to an arbitrary future date
    dt_postdoc["closing_date"] = datetime.date.today().strftime("%m/%d/%Y")

    dt = pd.concat([dt_faculty, dt_postdoc])  # postdoc rows get NaN notes
    dt["source"] = "[ecoevojobs]"

    # remove closing_date < today
    dt["closing_date"] = pd.to_datetime(dt["closing_date"])
    dt = dt.loc[dt["closing_date"] >= pd.to_datetime(datetime.date.today())]

    subject = (dt["rank"] + " | " + dt["subject"] + " | " + dt["institution"] +
               " | " + dt["url"] + " | " + dt["source"])
    body = dt["notes"]

    res = pd.DataFrame({
        "subject": subject,
        "body": body,
        "source": dt["source"]
    })

    res = utils.filter_limno(res)

    return res
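

# values_batch_get returns a dict of valueRanges in request order, with each
# cell wrapped in a one-element list. A toy version of the flattening step;
# the dict below mimics the gspread response shape and is not real sheet
# data. (DataFrame.applymap is renamed DataFrame.map in pandas >= 2.1.)
def _flatten_values_demo():
    import pandas as pd
    dt_raw = {"valueRanges": [{"values": [["Assistant Prof"], ["Lecturer"]]},
                              {"values": [["Limnology"], ["Hydrology"]]}]}
    dt = pd.DataFrame({"rank": dt_raw["valueRanges"][0]["values"],
                       "subject": dt_raw["valueRanges"][1]["values"]})
    return dt.applymap(lambda x: "".join(map(str, x)))  # ['Lecturer'] -> 'Lecturer'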
Example #7
def pull_csdms():
    csdms_baseurl = 'https://csdms.colorado.edu/wiki/Jobs'

    response = requests.get(csdms_baseurl)
    soup = BeautifulSoup(response.text, 'html.parser')
    individual_pages = soup.find_all('a')

    # keep only anchors that point into the Jobs: wiki namespace
    detect_csdms_url = lambda x: re.findall(
        r'(?<=\<a href="\/wiki\/Jobs:)(.*)(?=" title="Jobs)', x)
    csdms_urls = [detect_csdms_url(str(x)) for x in individual_pages]
    csdms_urls = [csdms_baseurl + ":" + x[0] for x in csdms_urls if x != []]

    subject_list = []
    body_list = []
    url_list = []

    # TODO: pull only the first several csdms_urls bc we can assume function is run regularly
    for url in csdms_urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # subject: first bolded element on the page
        msg_subject = soup.find_all('b')[0].get_text()

        # body: prefer paragraph tags, fall back to the main content column
        msg_body = soup.find_all('p')
        if len(msg_body) == 0:
            msg_body = soup.find_all('div', class_='col-sm-9')

        # url: prefer the posting's first external link, else the wiki page itself
        msg_url = soup.find_all('a', class_='external text')
        if len(msg_url) > 0:
            msg_url = msg_url[0].get('href')
        else:
            msg_url = url

        # collect this posting
        subject_list.append(msg_subject)
        body_list.append(str(msg_body[0]) if len(msg_body) > 0 else "error")
        url_list.append(msg_url)

    res = pd.DataFrame({
        'subject': subject_list,
        'body': body_list,
        'url': url_list
    })
    res['source'] = '[CSDMS]'
    res['subject'] = res['subject'] + ' | ' + \
        res['url'] + ' | ' + res['source']

    if len(res) > 0:
        res = utils.filter_limno(res)

    return res
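

# Every puller above returns a DataFrame with a compatible subject/body core,
# so a combined digest is one concat away. A hypothetical driver; the puller
# list and the dedupe step are illustrative, not part of the original module.
def _digest_demo():
    import pandas as pd
    pullers = [pull_ecolog, pull_pangeo, pull_rse, pull_csdms]
    digest = pd.concat([p() for p in pullers], ignore_index=True)
    return digest.drop_duplicates(subset="subject")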