import re

import requests

import Interface  # project module supplying Extractsalarydata()


def ScrapeTotalJobs(url):
    "scrapes additional job data from the web site"

    # Dictionary to hold standard job data.
    StandardData = {}
    StandardDataKeys = [
        'company', 'title', 'location', 'salary_min', 'salary_max'
    ]

    # Populate the dictionary of regular expressions used to extract job data
    # from the web page.
    WebFieldRes = {}
    WebFieldRes['removed'] = r'expiry-message">.*?<p>(.*?)</p>'
    WebFieldRes['title'] = r'<h1 class="brand-font">(.*?)</h1>'
    WebFieldRes['location'] = r'<li class="location icon">(.*?)</li>'
    WebFieldRes['commute'] = r'locationText">.*?<ul>.*?<li>(.*?)</li'
    WebFieldRes['salary'] = r'<li class="salary icon">.*?<div>(.*?)</div>'
    WebFieldRes['company'] = r'<li class="company icon">.*?"View jobs">(.*?)</a>.*?</li>'
    WebFieldRes['job_type'] = r'<li class="job-type icon">.*?<div>(.*?)</div>'
    WebFieldRes['expiry'] = r'<li class="date-posted icon">.*?<span>(.*?)</span>'

    # Request the HTTP content and flatten it into a single line so that
    # multi-line fields can be matched with plain '.*?' patterns.
    # Httpresponse = requests.get(url, auth=('*****@*****.**', 'l142rpn'), timeout=(2, 5))
    Httpresponse = requests.get(url, timeout=(2, 5))
    Httpcontent = Httpresponse.text.replace('\n', '')

    # Dictionary to hold web job data.
    WebData = {}
    for WebFieldRe in WebFieldRes:
        WebData[WebFieldRe] = ''

    # Extract web data, advancing past each match so the fields are found in
    # page order.
    for WebFieldRe in WebFieldRes:
        Httpmatch = re.search(WebFieldRes[WebFieldRe], Httpcontent)
        if Httpmatch:
            WebData[WebFieldRe] = Httpmatch.group(1).strip()
            Httpcontent = Httpcontent[Httpmatch.end(1):]

    # Return an empty dictionary of data if the job has expired.
    if WebData['removed'] == 'The job you are looking for is no longer available.':
        return StandardData
    if WebData['expiry'] == 'Expired':
        return StandardData
    if WebData['expiry'] == 'Recently':
        return StandardData

    # Populate the standard data dictionary.
    StandardData['company'] = WebData['company']
    StandardData['title'] = WebData['title']

    # The standard location value needs to be derived from one of two web values.
    Location = ''
    if len(WebData['location']) != 0:
        LocationParts = re.findall(r'class="engagement-metric">(.*?)<',
                                   WebData['location'])
        if not LocationParts:
            LocationParts = re.findall('<div>(.*?)<', WebData['location'])
    else:
        LocationParts = re.findall(r'class="engagement-metric">(.*?)<',
                                   WebData['commute'])
        if not LocationParts:
            LocationParts = WebData['commute'].split(',')

    # Create the location string.
    if LocationParts:
        for LocationPart in LocationParts:
            Location = Location + ',' + LocationPart
    StandardData['location'] = Location.lstrip(',')

    # Standard salary values need deriving from the web value.
    SalaryValues = Interface.Extractsalarydata(WebData['salary'])
    StandardData['salary_min'] = int(SalaryValues[0])
    StandardData['salary_max'] = int(SalaryValues[1])
    return StandardData

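
# All four scrapers hand their raw salary text to Interface.Extractsalarydata
# and expect back a (min, max) pair that int() can consume. That helper lives
# in the project's Interface module and is not shown here; the sketch below is
# a hypothetical stand-in illustrating the assumed contract, not the real
# implementation.
def ExtractsalarydataSketch(Salarytext):
    "hypothetical stand-in for Interface.Extractsalarydata"
    # Pull every number out of free text such as '£30,000 - £35,000 per annum',
    # dropping currency symbols and thousands separators.
    Amounts = [Amount.replace(',', '')
               for Amount in re.findall(r'\d[\d,]*', Salarytext)]
    if not Amounts:
        return ('0', '0')
    if len(Amounts) == 1:
        # A single figure is treated as both the minimum and the maximum.
        return (Amounts[0], Amounts[0])
    return (Amounts[0], Amounts[1])
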

def ScrapeIndeed(url):
    "scrapes additional job data from the web site"

    # Matrices for converting raw job data to standard job data.
    ConversionDict = {
        'company': ['company'],
        'title': ['title'],
        'location': ['location'],
        'salary_min': ['salary'],
        'salary_max': ['salary']
    }

    # Set regular expressions.
    titlere = r'jobsearch-JobInfoHeader-title">(.*?)<'
    expiredre = r'icl-Alert-headline">(.*?)</'
    valuere = r'"jobsearch-JobMetadataHeader-iconLabel">(.*?)<'
    # valuere = r'icl-u-xs-m[rt]--xs">(.*?)<'
    # The pattern above may work on some Indeed job adverts, but the display
    # code looks too unstable to warrant any changes.
    keyre = r'icl-IconFunctional--(.*?) icl'

    # Retrieve web text.
    Httpresponse = requests.get(url)
    Httplines = Httpresponse.text.split('\n')

    # Search for the job title and other job attributes.
    EngineJobData = {}
    Attributekeys = []
    Attributevalues = []
    ProcessedJobData = {}
    for Httpline in Httplines:
        Httpmatch = re.search(expiredre, Httpline)
        if Httpmatch:
            if Httpmatch.group(1) == 'This job has expired on Indeed':
                return ProcessedJobData
        Httpmatch = re.search(titlere, Httpline)
        if Httpmatch:
            EngineJobData['title'] = Httpmatch.group(1)
        Httpmatch = re.findall(keyre, Httpline)
        if Httpmatch:
            Attributekeys = Httpmatch
        Httpmatch = re.findall(valuere, Httpline)
        if Httpmatch:
            Attributevalues = Httpmatch

    # Create a dictionary of job attributes, pairing keys with values; stop at
    # the shorter list so a mismatched page cannot raise an IndexError.
    Attributeindex = 0
    while Attributeindex < min(len(Attributekeys), len(Attributevalues)):
        EngineJobData[Attributekeys[Attributeindex]] = Attributevalues[Attributeindex]
        Attributeindex += 1

    # Convert raw job data to standard job data.
    for StandardDataKey in ConversionDict:
        StandardDataValue = ''
        for EngineDataKey in ConversionDict[StandardDataKey]:
            if EngineDataKey in EngineJobData:
                StandardDataValue = StandardDataValue + EngineJobData[EngineDataKey] + ','
        ProcessedJobData[StandardDataKey] = StandardDataValue.rstrip(',')

    # Convert salary strings.
    SalaryValues = Interface.Extractsalarydata(ProcessedJobData['salary_min'])
    ProcessedJobData['salary_min'] = int(SalaryValues[0])
    ProcessedJobData['salary_max'] = int(SalaryValues[1])
    return ProcessedJobData

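
# A minimal usage sketch for ScrapeIndeed. The URL is a placeholder, not a
# real advert: an expired or unmatched page comes back as an empty dictionary,
# so callers can test truthiness before reading the standard keys.
def ExampleScrapeIndeedUsage():
    "hypothetical caller of ScrapeIndeed"
    JobData = ScrapeIndeed('https://www.indeed.co.uk/viewjob?jk=placeholder')
    if not JobData:
        print('Job expired or no data scraped.')
    else:
        print(JobData['title'], JobData['company'], JobData['salary_min'])
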

def ScrapeReed(url):
    "scrapes additional job data from the web site"

    # Matrices for converting raw job data to standard job data.
    ConversionDict = {
        'company': ['company'],
        'title': ['title'],
        'location': ['locality', 'region'],
        'salary_min': ['salary'],
        'salary_max': ['salary']
    }

    # Set regular expressions.
    titlere = r'<meta itemprop="title" content="(.*?)" />'
    companyre = r'<span itemprop="name">(.*?)<'
    keyre = r'<span\s*data-qa="(.*?)MobileLbl"\s*>'
    valuere = r'<span\s*data-qa="(.*?)MobileLbl"\s*>(.*?)<'
    expiredre = 'The following job is no longer available'

    # Retrieve web text.
    Httpresponse = requests.get(url)
    Httplines = Httpresponse.text.split('\n')

    # Search for the job title and other job attributes.
    EngineJobData = {}
    Attributekeys = []
    Attributevalues = []
    ProcessedJobData = {}
    JobExpired = False
    for Httpline in Httplines:
        Httpmatch = re.search(titlere, Httpline)
        if Httpmatch:
            EngineJobData['title'] = Httpmatch.group(1)
        Httpmatch = re.search(companyre, Httpline)
        if Httpmatch:
            EngineJobData['company'] = Httpmatch.group(1)
        Httpmatch = re.search(keyre, Httpline)
        if Httpmatch:
            Attributekeys.append(Httpmatch.group(1))
        Httpmatch = re.search(valuere, Httpline)
        if Httpmatch:
            Attributevalues.append(Httpmatch.group(2))
        Httpmatch = re.search(expiredre, Httpline)
        if Httpmatch:
            JobExpired = True

    # Return with no data if the job has expired.
    if JobExpired:
        return ProcessedJobData

    # Create a dictionary of job attributes, pairing keys with values; stop at
    # the shorter list so a mismatched page cannot raise an IndexError.
    Attributeindex = 0
    while Attributeindex < min(len(Attributekeys), len(Attributevalues)):
        EngineJobData[Attributekeys[Attributeindex]] = Attributevalues[Attributeindex]
        Attributeindex += 1

    # Convert raw job data to standard job data.
    for StandardDataKey in ConversionDict:
        StandardDataValue = ''
        for EngineDataKey in ConversionDict[StandardDataKey]:
            if EngineDataKey in EngineJobData:
                StandardDataValue = StandardDataValue + EngineJobData[EngineDataKey] + ','
        ProcessedJobData[StandardDataKey] = StandardDataValue.rstrip(',')

    # Convert salary strings.
    SalaryValues = Interface.Extractsalarydata(ProcessedJobData['salary_min'])
    ProcessedJobData['salary_min'] = int(SalaryValues[0])
    ProcessedJobData['salary_max'] = int(SalaryValues[1])
    return ProcessedJobData

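
# keyre and valuere in ScrapeReed target the same <span data-qa="...MobileLbl">
# tags: keyre captures the attribute stem, while valuere additionally captures
# the text inside the span. The HTML fragment below is invented purely to show
# how one line yields a matching key/value pair.
def ExampleReedSpanParsing():
    "hypothetical demonstration of the Reed key/value regexes"
    Sampleline = '<span data-qa="salaryMobileLbl">£25,000 per annum</span>'
    Key = re.search(r'<span\s*data-qa="(.*?)MobileLbl"\s*>', Sampleline).group(1)
    Value = re.search(r'<span\s*data-qa="(.*?)MobileLbl"\s*>(.*?)<', Sampleline).group(2)
    print(Key, '->', Value)  # salary -> £25,000 per annum
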

def ScrapeCVLibrary(url):
    "scrapes additional job data from the web site"

    # Matrices for converting raw job data to standard job data.
    ConversionDict = {
        'company': ['JOB_COMPANY_NAME'],
        'title': ['JOB_TITLE'],
        'location': ['JOB_TOWN', 'JOB_COUNTY'],
        'salary_min': ['JOB_SALARY'],
        'salary_max': ['JOB_SALARY']
    }

    # Set regular expressions.
    attributesre = r'dataLayer\.push\(\{(.*?)\}\)'

    # Set the expiration flag.
    JobExpired = False

    # Initialize job data.
    ProcessedJobData = {}

    # Retrieve web text.
    Httpresponse = requests.get(url)

    # Return no data if the job has expired: the response for an expired
    # advert lacks the 'Vary' header.
    Httpheaders = Httpresponse.headers
    if 'Vary' not in Httpheaders:
        return ProcessedJobData
    Httplines = Httpresponse.text.split('\n')

    # Search the web text for the line carrying the required job data.
    Httpmatch = None
    for Httpline in Httplines:
        Httpmatch = re.search(attributesre, Httpline)
        if Httpmatch:
            break

    # Return no data if the dataLayer line was not found.
    if not Httpmatch:
        return ProcessedJobData

    # Process the data line.
    JobAttributes = Httpmatch.group(1)
    JobAttributes = JobAttributes.replace('"', '')
    JobAttributes = JobAttributes.split(',')

    # Convert engine job attributes into key/value pairs.
    EngineJobData = {}
    for JobAttribute in JobAttributes:
        # It's possible that the data doesn't follow the <key>:<value> format.
        if ':' in JobAttribute:
            # Split on the first colon only, so values containing ':' survive.
            AttributePair = JobAttribute.split(':', 1)
            EngineJobData[AttributePair[0]] = AttributePair[1]

    # Convert raw job data to standard job data.
    for StandardDataKey in ConversionDict:
        StandardDataValue = ''
        for EngineDataKey in ConversionDict[StandardDataKey]:
            if EngineDataKey in EngineJobData:
                StandardDataValue = StandardDataValue + EngineJobData[EngineDataKey] + ','
        ProcessedJobData[StandardDataKey] = StandardDataValue.rstrip(',')

    # Convert salary strings.
    SalaryValues = Interface.Extractsalarydata(ProcessedJobData['salary_min'])
    ProcessedJobData['salary_min'] = int(SalaryValues[0])
    ProcessedJobData['salary_max'] = int(SalaryValues[1])
    return ProcessedJobData

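
# ScrapeCVLibrary pulls its fields from a JavaScript dataLayer.push({...})
# call embedded in the page. The line below is invented to illustrate the
# parsing; note that because the attributes are comma-split, a value that
# itself contains a comma (e.g. '£40,000') would be broken apart, so the
# sample salary is written without one.
def ExampleCVLibraryDataLayerParsing():
    "hypothetical demonstration of the dataLayer attribute parsing"
    Sampleline = 'dataLayer.push({"JOB_TITLE":"Test Engineer","JOB_TOWN":"Leeds","JOB_SALARY":"40000"});'
    Httpmatch = re.search(r'dataLayer\.push\(\{(.*?)\}\)', Sampleline)
    JobAttributes = Httpmatch.group(1).replace('"', '').split(',')
    print(dict(Attr.split(':', 1) for Attr in JobAttributes if ':' in Attr))
    # {'JOB_TITLE': 'Test Engineer', 'JOB_TOWN': 'Leeds', 'JOB_SALARY': '40000'}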