class Datasource(CommonCrawlDatasource):
    name = module_name(__name__)
    query = "gumtree.com.au/s-ad/*"
    query_filters = [
        "~url:.*/(account-manager|account-relationship-management|accounting|accounts-officer-clerk|accounts-payable|accounts-receivable-credit-control|admin|administration-office-support|administrative-assistant|advertising-arts-media|aged-disability-support|agronomy-farm-services|air-conditioning-refrigeration|analysis-reporting|architecture|art-director|assembly-process-work|automotive-engineering|automotive-trades|bakers-pastry-chefs|banking-financial-services|banking-retail-branch|bar-beverage-staff|bookkeeping-small-practice-accounting|building-services-engineering|building-trades|business-services-corporate-advisory|butcher|call-centre-customer-service|carpentry-cabinet-making|chef-cook|child-welfare-youth-family-services|childcare-after-school-care|civil-structural-engineering|cleaner-housekeeper|coaching-instruction|commercial-sales-leasing-property-mgmt|community-services-development|construction|consulting-generalist-hr|contract-management|corporate-commercial-law|courier-driver-postal-service|customer-service-call-centre|customer-service-customer-facing|defence|dental-dentist|design-architecture|developer-programmer|digital-search-marketing|education-teaching|electrician|employment-services|engineering|event-management|facilities-management-body-corporate|farm-management|farming-veterinary|financial-accounting-reporting|financial-manager-controller|financial-planning|fitter-turner-machinist|florist|foreman-supervisor|freight-cargo-forwarding|front-office-guest-services|funds-management|gardening-landscaping|general-practitioner-gp-|generalist|government-defence|government|graphic-design|hair-beauty-services|healthcare-administration|healthcare-nursing|horticulture|hospitality-tourism|information-communication-technology|interaction-web-design|interior-design|it-support-help-desk|kitchen-sandwich-hand|labourer|legal-secretary|legal|locksmith|machine-operators|machine-plant-operator|maintenance-handyman|maintenance|management|manufacturing-transport-logistics|marketing-assistants|marketing-communications|marketing-communications|marketing-manager|mechanical-engineering|media-planning-strategy-buying|merchandiser|mining-engineering-maintenance|mining-operations|mining-resources-energy|mortgage-broker|nanny-babysitter|new-business-development|nursing|oil-gas-engineering-maintenance|oil-gas-operations|other-jobs|other|pa-ea-secretary|painter-sign-writer|paralegal-law-clerk|payroll-accounting|performing-arts|personal-trainer|pharmacy|physiotherapy-ot-rehabilitation|plumber|police-corrections-officer|printing-publishing-services|production-planning-scheduling|project-management|property-law|public-relations-corporate-affairs|purchasing-procurement-inventory|real-estate-property|receptionist|recruitment-agency|recruitment-hr|recruitment-internal|relationship-account-management|removalist|residential-leasing-property-management|residential-sales|retail-assistant|retail-management|retail|road-transport|sales-call-centre|sales-coordinator|sales-customer-facing|sales-management|sales-representative-consultant|sales|security-services|sports-management|sports-recreation|systems-business-analyst|tailor-dressmaker|taxation|teaching|technician|tour-guide|trade-marketing|trades-services|training-development|travel-agent-consultant|tutoring|vet-animal-welfare|waiting-staff|warehousing-storage-distribution|web-development-production|workplace-training-assessment|writing-journalist|welder-boilermaker)/"
    ]

    def extract(self, html: bytes, uri, view_date):
        text = html.decode("utf-8")
        obj = parse_js_obj(text, JS_STR_APP)
        if obj is None:
            return []
        else:
            data = obj["vip"]["item"]
            # adType: OFFER is a job ad; WANTED is a request for work
            if data["isJobsCategory"] and data["adType"] == "OFFER":
                return [{"data": data, "uri": uri, "view_date": view_date}]
            else:
                return []

    def normalise(self, data, uri, view_date):
        metadata = {row["value"]: row["name"] for row in data["mainAttributes"]}
        salary_raw = metadata.get("Salary Detail")
        salary_data = get_salary_data(salary_raw)
        return {
            "title": data["title"],
            "description": html2plain(data["description"]),
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            "org": None,
            **salary_data,
            "location_raw": data["mapAddress"],
            **AU_GEOCODER.geocode(data["mapAddress"]),
        }
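parse_js_obj and JS_STR_APP are project-internal helpers, so the snippet above is not runnable on its own. A minimal sketch of what such a helper might do, assuming the page embeds a JSON-compatible object after a known marker string (the marker value here is invented):

import json
from typing import Any, Dict, Optional

# Hypothetical marker; the real JS_STR_APP constant is project-internal.
JS_STR_APP = "window.APP_DATA = "

def parse_js_obj(text: str, marker: str) -> Optional[Dict[str, Any]]:
    """Sketch: locate `marker` in the page source and decode the object
    that follows it. Assumes the object is JSON-compatible; real pages
    may need a more tolerant JavaScript-object parser."""
    start = text.find(marker)
    if start == -1:
        return None
    brace = text.find("{", start)
    if brace == -1:
        return None
    try:
        # raw_decode stops at the end of the first complete object.
        obj, _ = json.JSONDecoder().raw_decode(text[brace:])
    except ValueError:
        return None
    return obj if isinstance(obj, dict) else None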
Example #2
class Datasource(CommonCrawlDatasource):
    name = module_name(__name__)

    def extract(self, html: bytes, base_url: str, view_date):
        data = extruct.extract(html, base_url, syntaxes=["json-ld"])["json-ld"]
        job_posts = [datum for datum in data if datum.get("@type") == "JobPosting"]
        return [{
            "data": post,
            "uri": base_url,
            "view_date": view_date
        } for post in job_posts]

    def normalise(self, data, uri, view_date):
        if "description" in data:
            description = html2plain(data["description"])
        else:
            description = None

        org = data.get("hiringOrganization")
        if isinstance(org, dict):
            org = org.get("name")

        return {
            "title": data["title"],
            "description": description,
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            "org": org,
        }
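extruct.extract returns a dict keyed by syntax name, so the ["json-ld"] index above yields a list of the page's JSON-LD objects. Purely for illustration (all field values invented), the JobPosting shape this normalise expects looks roughly like:

# Illustrative only: one item as extruct returns it for a page with a
# schema.org JobPosting in a <script type="application/ld+json"> block.
job_posting = {
    "@context": "https://schema.org",
    "@type": "JobPosting",
    "title": "Office Administrator",            # invented example value
    "description": "<p>Duties include...</p>",  # HTML, hence html2plain()
    "hiringOrganization": {"@type": "Organization", "name": "Acme Pty Ltd"},
}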
Example #3
class Datasource(CommonCrawlDatasource):
    name = module_name(__name__)

    def extract(self, html: bytes, base_url: str, view_date):
        data = extruct.extract(html, base_url, syntaxes=["microdata"])["microdata"]
        job_posts = [
            datum["properties"]
            for datum in data
            if datum["type"] == "http://schema.org/JobPosting"
        ]
        return [
            {"data": post, "uri": base_url, "view_date": view_date}
            for post in job_posts
        ]

    def normalise(self, data, uri, view_date):
        org = data["hiringOrganization"]
        if isinstance(org, dict):
            org = org.get("name")
        if not isinstance(org, str):
            org = None
        return {
            "title": data["title"],
            "description": html2plain(data.get("description", "")),
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            "org": org,
        }
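For comparison, extruct's microdata items wrap their fields in a "properties" dict and use a full schema.org type URL, which is why this extract indexes differently from the JSON-LD version. hiringOrganization may arrive as a plain string or a nested dict, hence the two isinstance checks in normalise. An invented example item:

# Illustrative only: shape of one extruct microdata item (values invented).
microdata_item = {
    "type": "http://schema.org/JobPosting",
    "properties": {
        "title": "Research Assistant",
        "description": "<p>Duties include...</p>",
        "hiringOrganization": "Example University",  # sometimes a dict instead
    },
}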
Example #4
class Datasource(CommonCrawlDatasource):
    name = module_name(__name__)
    query = "probonoaustralia.com.au/jobs/*"

    def extract(self, html: Union[bytes, str], uri, view_date):
        soup = bs4.BeautifulSoup(html, "html5lib")
        infos = soup.select(".org-basic-info > div > p.org-add")
        data = {}
        for info in infos:
            key = info.select_one("b")
            if not key:
                logging.warning("Missing key in %s; %s", uri, info)
                continue
            schema_key = key.get_text().strip()
            value = "".join(
                str(s.get_text() if isinstance(s, bs4.element.Tag) else s).strip()
                for s in key.next_siblings
            )
            data[schema_key] = value
        description = str(soup.select_one("#about-role") or "")
        hiringOrganization_description = str(
            soup.select_one("#about-organisation") or ""
        )
        header = soup.select_one("h1")
        if not header:
            logging.warning("Missing header: %s", uri)
            title = None
        else:
            title = header.get_text().strip()
        return [
            {
                "title": title,
                "description": description,
                "organisation_description": hiringOrganization_description,
                "metadata": data,
                "uri": uri,
                "view_date": view_date,
            }
        ]

    def normalise(
        self, title, description, organisation_description, metadata, uri, view_date
    ):
        salary_text = metadata.get("Salary :")
        salary_data = get_salary_data(salary_text)
        location_raw = metadata["Location :"]
        return {
            "title": title,
            "description": html2plain(description),
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            "org": metadata.get("Organisation :"),
            **salary_data,
            "location_raw": location_raw,
            **AU_GEOCODER.geocode(fix_probono_location(location_raw)),
        }
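The key/value scrape above leans on bs4's next_siblings: everything after the <b> label inside the same <p> becomes the value. A self-contained sketch of the idea on an invented fragment:

import bs4

# Invented fragment mirroring the .org-add markup this extract targets.
html = '<p class="org-add"><b>Salary :</b> $60,000 + super</p>'
label = bs4.BeautifulSoup(html, "html5lib").select_one("p.org-add b")
value = "".join(
    str(s.get_text() if isinstance(s, bs4.element.Tag) else s).strip()
    for s in label.next_siblings
)
print({label.get_text().strip(): value})  # {'Salary :': '$60,000 + super'}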
Example #5
class Datasource(JSONLinkedDatasource):
    name = module_name(__name__)
    query = "ethicaljobs.com.au/members/*"

    def normalise(self, data, uri, view_date):
        ans = super().normalise(data, uri, view_date)
        # Salary not in metadata
        location_raw = location_jsonld(data)
        return {
            **ans,
            "location_raw": location_raw,
            **AU_GEOCODER.geocode(location_raw),
        }
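AU_GEOCODER.geocode is spread into every normalised record with **, so whatever it is, it must return a mapping even when the lookup fails. A stand-in that honours that assumed contract (the class name, field names, and lookup logic are all invented):

from typing import Any, Dict

class SketchGeocoder:
    """Stand-in for the project's AU_GEOCODER: resolve free-text Australian
    locations to structured fields, returning {} when nothing matches so
    the ** merge in normalise() stays safe."""

    STATES = {"NSW", "VIC", "QLD", "SA", "WA", "TAS", "NT", "ACT"}

    def geocode(self, location_raw: str) -> Dict[str, Any]:
        tokens = [t.strip().upper() for t in (location_raw or "").split(",")]
        state = next((t for t in tokens if t in self.STATES), None)
        return {"location_state": state} if state else {}

AU_GEOCODER = SketchGeocoder()
print(AU_GEOCODER.geocode("Melbourne, VIC"))  # {'location_state': 'VIC'}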
Example #6
class Datasource(JSONLinkedDatasource):
    name = module_name(__name__)
    query = "www.cgcrecruitment.com/job/*"

    def normalise(self, data, uri, view_date):
        ans = super().normalise(data, uri, view_date)
        salary_raw = data["baseSalary"]["value"].get("value")
        salary = get_salary_data(salary_raw)
        location_raw = location_jsonld(data)
        return {
            **ans,
            **salary,
            "location_raw": location_raw,
            **AU_GEOCODER.geocode(location_raw),
        }
Example #7
class Datasource(MicrodataDatasource):
    name = module_name(__name__)
    query = "jobs.csiro.au/job/*"

    def normalise(self, data, uri, view_date):
        # Description is sometimes a list, e.g. in CC-MAIN-2019-18
        if isinstance(data.get("description"), list):
            data["description"] = "\n".join(data["description"])
        ans = super().normalise(data, uri, view_date)
        # jobLocation *can* be an array; join it rather than str()-ing a list
        location = data.get("jobLocation") or ""
        if isinstance(location, list):
            location = ", ".join(str(loc) for loc in location)
        location_raw = str(location)
        return {
            **ans,
            "location_raw": location_raw,
            **AU_GEOCODER.geocode(location_raw),
        }
Example #8
class Datasource(CommonCrawlDatasource):
    name = module_name(__name__)
    query = "iworkfor.nsw.gov.au/job/*"

    def extract(self, html: Union[bytes, str], uri, view_date):
        soup = bs4.BeautifulSoup(html, "html5lib")
        body = soup.select_one("tbody")
        # Some pages are missing a body; e.g. CC-MAIN-2018-17
        if not body:
            return []
        infos = body.select("tr")
        data = {}
        for info in infos:
            key = info.select_one("th")
            value = info.select_one("td")
            if key and value:
                data[key.get_text().strip()] = value.get_text().strip()
        title_tag = soup.select_one(".job-detail-title")
        if not title_tag:
            logging.warning("Missing title tag in %s, %s", uri, view_date)
            title = None
        else:
            title = title_tag.get_text().strip()
        description = str(soup.select_one(".job-detail-des") or "")
        return [
            {
                "title": title,
                "description": description,
                "metadata": data,
                "uri": uri,
                "view_date": view_date,
            }
        ]

    def normalise(self, title, description, metadata, uri, view_date):
        salary = get_salary_data(metadata.get("Total Remuneration Package:") or "")
        location_raw = metadata["Job Location:"]
        return {
            "title": title,
            "description": html2plain(description),
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            "org": metadata["Organisation/Entity:"],
            **salary,
            "location_raw": location_raw,
            **AU_GEOCODER.geocode(fixup_iworkfornsw_loc(location_raw)),
        }
Example #9
class Datasource(CommonCrawlDatasource):
    name = module_name(__name__)
    query = "careers.vic.gov.au/job/*"

    def extract(self, html: Union[bytes, str], uri, view_date) -> List[Dict[Any, Any]]:
        soup = bs4.BeautifulSoup(html, "html5lib")
        data = {}
        for info in soup.select(".txt-info"):
            key = info.select_one(".txt-bold")
            if not key:
                continue
            key_text = key.get_text().strip()
            value = "".join(str(s).strip() for s in key.next_siblings)
            data[key_text] = value

        # Title and description are page-level, so extract them outside the loop
        title_tag = soup.select_one(".txt-title")
        if not title_tag:
            logging.warning("Missing title tag in %s", uri)
            title = None
        else:
            title = title_tag.get_text().strip()
        description = str(soup.select_one(".txt-pre-line") or "")
        return [{
            "title": title,
            "description": description,
            "metadata": data,
            "uri": uri,
            "view_date": view_date,
        }]

    def normalise(self, title, description, metadata, uri, view_date):
        salary_data = get_salary_data(
            metadata.get("Salary:") or metadata["Salary Range:"])
        location_raw = metadata.get("Location:") or metadata["Work location:"]
        return {
            "title": title,
            "description": html2plain(description),
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            "org": metadata.get("Organisation:"),
            **salary_data,
            "location_raw": location_raw,
            **AU_GEOCODER.geocode(fixup_careers_vic_location(location_raw)),
        }
Example #10
class Datasource(KaggleDatasource):
    # License: CC0: Public Domain
    dataset = "promptcloud/latest-seek-australia-job-dataset"
    sources = {
        "seekau_2019q3": "marketing_sample_for_seek_au-jobs_listing__20190901_20191231__10k_data.json"
    }
    name = module_name(__name__)

    raw_extension = ".json"

    def extract_one(self, path: Path) -> Generator[Dict[Any, Any], None, None]:
        with open(path, "r") as f:
            for line in f:
                yield json.loads(line)

    def normalise(self, *args, **data) -> Dict[str, Any]:
        location_text = ", ".join([
            data["city"],
            data["state"],
            data.get("country") or data["inferred_country"],
        ])
        salary_text = data.get("salary_offered")
        return {
            "title": data["job_title"],
            "description": data["job_description"],
            "uri": data["url"],
            "view_date": datetime.datetime.strptime(
                data["crawl_timestamp"], "%Y-%m-%d %H:%M:%S +0000"
            ),
            "org": data["company_name"],
            **get_salary_data(salary_text),
            "location_raw": location_text,
            **AU_GEOCODER.geocode(location_text),
        }
Example #11
class Datasource(KaggleDatasource):
    # License: CC0: Public Domain
    dataset = "santokalayil/data-scientist-jobs-in-australia-october-25-2019"
    sources = {
        "indeedau_datascience_202010": "datascientist_jobs_in_australia_Oct_25_2019.csv"
    }
    raw_extension = ".csv"
    name = module_name(__name__)

    def extract_one(self, path: Path) -> Generator[Dict[Any, Any], None, None]:
        with open(path, "r", encoding="latin-1") as f:
            for row in csv.DictReader(f):
                yield {k: v for k, v in row.items() if k}

    def normalise(self, *args, **data) -> Dict[str, Any]:
        return {
            "title": data["title"],
            "description": data["summary"],
            "view_date": datetime.datetime(2019, 10, 25),
            "org": data["company"],
            "location_raw": data["location"],
            **AU_GEOCODER.geocode(data["location"]),
        }
Example #12
class Datasource(KaggleDatasource):
    # License: CC BY-SA 4.0
    dataset = "PromptCloudHQ/australian-job-listings-data-from-seek-job-board"
    sources = {"seekau": "seek_australia_sample.csv"}
    raw_extension = ".csv"
    name = module_name(__name__)

    def extract_one(self, path: Path) -> Generator[Dict[Any, Any], None, None]:
        with open(path, "r", encoding="latin-1") as f:
            for row in csv.DictReader(f):
                yield row

    def normalise(self, *args, **data) -> Dict[str, Any]:
        parts = [
            x for x in [data.get("city"), data.get("state"), data.get("geo")] if x
        ]
        location_text = ", ".join(parts)
        return {
            "title": data["job_title"],
            "description": data["job_description"],
            "uri": data["pageurl"],
            "view_date": datetime.datetime.strptime(
                data["crawl_timestamp"], "%Y-%m-%d %H:%M:%S +0000"
            ),
            "org": data["company_name"],
            **get_salary_data(data.get("salary_offered")),
            "location_raw": location_text,
            **AU_GEOCODER.geocode(location_text),
        }
Example #13
class Datasource(CommonCrawlDatasource):
    name = module_name(__name__)
    query = "seek.com.au/job/*"
    query_filters = ["!~url:.*/apply/*"]

    def extract(self, html: bytes, uri, view_date):
        text = html.decode("utf-8")
        obj = parse_js_obj(text, JS_STR_REDUX)
        if obj is None:
            return []
        else:
            return [{
                "data": obj["jobdetails"]["result"],
                "uri": uri,
                "view_date": view_date,
            }]

    def normalise(self, data, uri, view_date):
        salary_text = data["salary"]
        location = data["locationHierarchy"]
        location_text = ", ".join([
            location["suburb"],
            location["city"],
            location["state"],
            location["nation"],
        ])
        return {
            "title": data["title"],
            "description": html2plain(data["mobileAdTemplate"]),
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            "org": data["advertiser"]["description"],
            **get_salary_data(salary_text),
            "location_raw": location_text,
            **AU_GEOCODER.geocode(location_text),
        }
Example #14
class Datasource(KaggleDatasource):
    # License: CC BY-NC-SA 4.0
    dataset = "PromptCloudHQ/australian-jobs-on-gumtreecomau"
    sources = {"gumtreeau": "gumtree_com_au-sample.csv"}
    raw_extension = ".csv"
    name = module_name(__name__)

    def extract_one(self, path: Path) -> Generator[Dict[Any, Any], None, None]:
        with open(path, "r") as f:
            for row in csv.DictReader(f):
                yield row

    def normalise(self, *args, **data) -> Dict[str, Any]:
        return {
            "title": data["job_title"],
            "description": data["job_description"],
            "uri": data["page_url"],
            # Not strictly a view date: the dataset only records when the ad was added
            "view_date": datetime.datetime.strptime(data["date_added"], "%d/%m/%Y"),
            **get_salary_data(data["salary"]),
            "location_raw": data["location"],
            **AU_GEOCODER.geocode(data["location"]),
        }
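get_salary_data appears in most of these normalise methods but is also project-internal. A minimal sketch of such a helper, assuming it returns a dict of salary fields that merges cleanly via ** (the field names and regex are assumptions, not the project's actual output):

import re
from typing import Any, Dict, Optional

def get_salary_data(salary_raw: Optional[str]) -> Dict[str, Any]:
    """Sketch: pull the first dollar amount out of free-text salary copy.
    The real helper presumably handles ranges and hourly/annual rates;
    this only demonstrates the merge-friendly return shape."""
    if not salary_raw:
        return {"salary_raw": salary_raw, "salary": None}
    match = re.search(r"\$\s*([\d,]+(?:\.\d+)?)", salary_raw)
    salary = float(match.group(1).replace(",", "")) if match else None
    return {"salary_raw": salary_raw, "salary": salary}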