import csv
import json
import logging
import os
import time
from os import path

import requests

logger = logging.getLogger(__name__)


def main():
    # Headers captured from a live browser session; enc_data and timestamp
    # appear to be the request-signing values WebMD expects with the client_id.
    extra_headers = {
        'client_id': 'e4e3f73a-0ceb-4d37-939e-90ddb1238360',
        'Accept': 'application/json',
        'DNT': '1',
        'enc_data': 'rVXhR/l0GMCjq+aJJ/l0wOcesWjLwV6yFFXc6JqW46c=',
        'timestamp': 'Mon, 27 Jan 2020 20:36:06 GMT',
    }
    sc = ScraperRequest(base_url='https://www.webmd.com/',
                        extra_headers=extra_headers)
    input_file = './data/wedmdrx_mapping.csv'
    meds = read_csv(input_file)
    number_meds = len(meds)
    meds_processed = 0
    for med in meds:
        meds_processed += 1
        logger.info('Processing %d/%d => %s',
                    meds_processed, number_meds, med['webmd_slug'])
        ndc = med['ndc']
        output_file = './data/webmdrx_json_data/%s' % ndc
        url = 'https://www.webmd.com/search/2/api/rx/forms/v3/%s?app_id=web' % ndc
        # Skip NDCs already fetched on a previous run.
        if path.exists(output_file):
            continue
        try:
            data = sc.get_parsed_json(url)
            with open(output_file, 'w') as write_file:
                json.dump(data, write_file)
        except Exception:
            logger.exception('Error in Getting Data for %s', med['webmd_slug'])
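
# `read_csv` is called above but not defined in this section. A minimal sketch,
# assuming the mapping file has a header row and each row is consumed as a dict
# (main() indexes rows by 'webmd_slug' and 'ndc'):
def read_csv(filename):
    with open(filename, newline='') as input_file:
        rows = list(csv.DictReader(input_file))
    logger.info('Rows Read From %s = %d', filename, len(rows))
    return rows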

class ScraperRequest:
    """Thin wrapper around requests.Session with optional rotating proxies."""

    def __init__(self, base_url, extra_headers=None, extra_cookies=None,
                 useProxy=False):
        logger.info('Initiating Scraper')
        self._session = requests.Session()
        self._headers = {}
        self._cookie_jar = {}
        self._proxies = None
        self._requests_made = 0
        if extra_headers is not None:
            self._headers.update(extra_headers)
        if extra_cookies is not None:
            self._cookie_jar.update(extra_cookies)
        self._useProxy = useProxy
        # Warm up the session against the landing page before any API calls.
        self._get_response(base_url)
        self._session.headers.update(self._headers)
    def _get_response(self, url):
        self._requests_made += 1
        # Pick up a fresh proxy on the first request and every fifth one after.
        if self._useProxy and (self._proxies is None
                               or self._requests_made % 5 == 0):
            logger.info('Setting Up Proxy')
            self._proxies = self.get_proxy_url()
        time.sleep(1)  # crude rate limiting between requests
        if self._useProxy:
            response = self._session.get(url, cookies=self._cookie_jar,
                                         headers=self._headers,
                                         proxies=self._proxies)
        else:
            response = self._session.get(url, cookies=self._cookie_jar,
                                         headers=self._headers)
        logger.debug('Url Response : %s : %s', response.status_code, url)
        return response
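
    # `get_proxy_url` is called above but not defined in this section. A minimal
    # sketch, assuming the proxy address comes from a PROXY_URL environment
    # variable (a hypothetical configuration source); the returned dict is the
    # shape requests expects for its `proxies` argument.
    def get_proxy_url(self):
        proxy = os.environ.get('PROXY_URL')
        return {'http': proxy, 'https': proxy} if proxy else None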
    def get_parsed_json(self, url):
        response = self._get_response(url)
        logger.debug(response.text)
        return json.loads(response.text)


def write_to_csv(filename, data, append=False, delimiter=',',
                 ignoreFieldErrors=True, header=None):
    """Write a list of dicts to CSV, optionally appending to an existing file."""
    if not data or not isinstance(data, list) or not isinstance(data[0], dict):
        logger.error('Data Not Correct')
        return
    if not path.exists(filename):
        append = False
    permissions = 'a' if append else 'w'
    if header is None:
        header = list(data[0].keys())
    if append:
        # Reuse the header already present in the file.
        with open(filename) as headerfile:
            header = headerfile.readline().rstrip('\n').split(delimiter)
    logger.info('Header for output : %s', header)
    num_lines = 0
    with open(filename, permissions, newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=header)
        if not append:
            dict_writer.writeheader()
        header_set = frozenset(header)
        for row in data:
            row_headers = frozenset(row.keys())
            if ignoreFieldErrors:
                # Pad missing fields and drop unknown ones so DictWriter
                # never raises on ragged rows.
                for fieldName in header_set - row_headers:
                    row[fieldName] = ''
                for fieldName in row_headers - header_set:
                    del row[fieldName]
            dict_writer.writerow(row)
            num_lines += 1
            if num_lines % 1000 == 0:
                logger.info('%d Lines Written to %s', num_lines, filename)
    logger.info('Total Lines Written to %s = %d', filename, num_lines)
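
# A short usage note for write_to_csv (hypothetical rows), showing how ragged
# dicts are reconciled against the header while ignoreFieldErrors is left on:
#
#     rows = [{'ndc': '0002-1433', 'price': '12.50'}, {'ndc': '0002-1434'}]
#     write_to_csv('./data/prices.csv', rows)               # header + 2 rows
#     write_to_csv('./data/prices.csv', rows, append=True)  # reuses file header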

def read_set(filename, isstreaming=False):
    """Read a file into a list of stripped lines.

    `isstreaming` is accepted for compatibility but currently unused.
    """
    with open(filename, 'r') as reader:
        lines = [x.rstrip() for x in reader]
    logger.info('Lines Read From %s = %d', filename, len(lines))
    return lines

def write_set(filename, list_rows, append=False):
    permissions = 'a' if append else 'w'
    with open(filename, permissions) as output_file:
        output_file.writelines(x + '\n' for x in list_rows)
    logger.info('Total Lines Written to %s = %d', filename, len(list_rows))
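
# Standard entry point so the module runs as a script. The basicConfig call is
# a minimal assumption; the original section never configures the logger.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    main()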