Example 1
import json
from os import path

# ScraperRequest, read_csv and logger are provided elsewhere in the project.
def main():
    extra_headers = {
        'client_id': 'e4e3f73a-0ceb-4d37-939e-90ddb1238360',
        'Accept': 'application/json',
        'DNT': '1',
        'enc_data': 'rVXhR/l0GMCjq+aJJ/l0wOcesWjLwV6yFFXc6JqW46c=',
        'timestamp': 'Mon, 27 Jan 2020 20:36:06 GMT'
    }
    sc = ScraperRequest(base_url='https://www.webmd.com/',
                        extra_headers=extra_headers)
    input_file = './data/webmdrx_mapping.csv'
    meds = read_csv(input_file)
    number_meds = len(meds)
    meds_processed = 0
    for med in meds:
        meds_processed += 1
        logger.info('Processing %d/%d => %s' %
                    (meds_processed, number_meds, med['webmd_slug']))
        ndc = med['ndc']
        output_file = './data/webmdrx_json_data/%s' % ndc
        url = 'https://www.webmd.com/search/2/api/rx/forms/v3/%s?app_id=web' % ndc
        # Skip NDCs already fetched on a previous run so the job can resume.
        if path.exists(output_file):
            continue
        try:
            data = sc.get_parsed_json(url)
            with open(output_file, 'w') as write_file:
                json.dump(data, write_file)
        except Exception:
            logger.exception('Error in getting data for %s' % med['webmd_slug'])
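The loop's key design choice is resume-on-rerun: each NDC's JSON lands in its own file, and files that already exist are skipped. A minimal standard-library sketch of the same pattern, with a hypothetical fetch callable standing in for the scraper:

import json
from os import path

def fetch_once(key, fetch, out_dir='./data/cache'):
    # Skip keys already cached by a previous run; otherwise fetch and save.
    out_file = path.join(out_dir, key)
    if path.exists(out_file):
        return
    with open(out_file, 'w') as f:
        json.dump(fetch(key), f)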
Example 2
def __init__(self,
             base_url,
             extra_headers=None,
             extra_cookies=None,
             useProxy=False):
    logger.info('Initiating Scraper')
    # Merge caller-supplied headers and cookies into the instance defaults.
    if extra_headers is not None:
        self._headers.update(extra_headers)
    if extra_cookies is not None:
        self._cookie_jar.update(extra_cookies)
    self._useProxy = useProxy
    # Request the base URL once to prime the cookie jar, then attach the
    # merged headers to the session for any later direct session use.
    self._get_response(base_url)
    self._session.headers.update(self._headers)
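The constructor's main move is priming one shared requests.Session against base_url before real scraping starts, so later API calls carry the site's cookies. A rough standalone equivalent using requests directly (the header values here are illustrative):

import requests

session = requests.Session()
session.get('https://www.webmd.com/')  # prime the cookie jar for later calls
session.headers.update({'Accept': 'application/json', 'DNT': '1'})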
Example 3
def _get_response(self, url):
    self._requests_made += 1
    # Rotate proxies: fetch a fresh one on the first request and then on
    # every fifth request after that.
    if self._useProxy and (self._proxies is None
                           or self._requests_made % 5 == 0):
        logger.info('Setting Up Proxy')
        self._proxies = self.get_proxy_url()
        time.sleep(1)
    # proxies=None is the requests default, so one call covers both modes.
    return self._session.get(url,
                             cookies=self._cookie_jar,
                             headers=self._headers,
                             proxies=self._proxies if self._useProxy else None)
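The rotation rule is easy to miss inside the request logic, so here it is in isolation (get_proxy_url is assumed to return a requests-style proxies dict elsewhere in the class):

def needs_new_proxy(requests_made, current_proxies):
    # True on the first request and on every fifth request thereafter.
    return current_proxies is None or requests_made % 5 == 0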
Example 4
def write_to_csv(filename,
                 data,
                 append=False,
                 delimiter=',',
                 ignoreFieldErrors=True,
                 header=None):  # data is assumed to be a list of dicts
    if not isinstance(data, list) or len(data) == 0 or not isinstance(data[0], dict):
        logger.error('Data Not Correct')
        return

    # Can't append to a file that doesn't exist yet.
    if not path.exists(filename):
        append = False

    if header is None:
        header = list(data[0].keys())

    if append:  # reuse the header already present in the file
        with open(filename) as headerfile:
            header = headerfile.readline().rstrip('\n').split(delimiter)

    logger.info('Header for output : %s' % header)

    permissions = 'a' if append else 'w'  # 'w+' would truncate the file
    num_lines = 0
    with open(filename, permissions, newline='') as output_file:
        dict_writer = csv.DictWriter(output_file,
                                     fieldnames=header,
                                     delimiter=delimiter)
        if not append:
            dict_writer.writeheader()
        header_set = frozenset(header)
        for row in data:
            row_headers = frozenset(row.keys())
            if ignoreFieldErrors:
                # Pad missing fields with '' and drop fields not in the
                # header (note: this mutates the caller's dicts).
                for fieldName in header_set - row_headers:
                    row[fieldName] = ''
                for fieldName in row_headers - header_set:
                    del row[fieldName]
            dict_writer.writerow(row)
            num_lines += 1
            if num_lines % 1000 == 0:
                logger.info("%d Lines Written to %s", num_lines, filename)
    logger.info("Total Lines Written to %s = %d", filename, num_lines)
Example 5
def get_parsed_json(self, url):
    response = self._get_response(url)
    logger.debug(response.text)  # full response bodies are noisy; log at DEBUG
    return json.loads(response.text)
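Hypothetical usage, assuming sc is the ScraperRequest instance from Example 1 and the NDC is illustrative:

forms = sc.get_parsed_json(
    'https://www.webmd.com/search/2/api/rx/forms/v3/0002-1433?app_id=web')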
Example 6
def read_set(filename, isstreaming=False):  # isstreaming is currently unused
    with open(filename, 'r') as reader:
        lines = [line.rstrip('\n') for line in reader]
    logger.info("Lines Read From %s = %d", filename, len(lines))
    return lines
Example 7
def write_set(filename, list_rows, append=False):
    permissions = 'a' if append else 'w'  # 'w+' would truncate on append
    with open(filename, permissions) as output_file:
        output_file.writelines(x + '\n' for x in list_rows)
    logger.info("Total Lines Written to %s = %d", filename, len(list_rows))