class LigoQ3cTableParser(object):

    # +
    # method: __init__()
    # -
    def __init__(self, url=DEFAULT_BASE_URL, credentials=DEFAULT_CREDENTIALS, schema=DEFAULT_SCHEMA, verbose=False):

        # get input(s)
        self.url = url
        self.credentials = credentials
        self.schema = schema
        self.verbose = verbose

        # private variable(s)
        self.__response = None
        self.__soup = None
        self.__username, self.__password = self.__credentials.split(':')
        # remember the site root; scrape_json()/scrape_tsv() re-point self.__url at download files
        self.__base_url = self.__url
        self.__after = []
        self.__aka = []
        self.__before = []
        self.__dates = []
        self.__events = []
        self.__names = []
        self.__columns = 0
        self.__headers = []
        self.__rows = 0
        self.__authorization = {'username': self.__username, 'password': self.__password}
        self.__user_agent = {'user-agent': TNS_USER_AGENT.replace('<username>', self.__username)}

        # verbose
        self.__log = UtilsLogger('LigoQ3cScrapeUserAgent').logger if self.__verbose else None
        if self.__verbose:
            self.__log.info(f"url='{self.__url}'")
            self.__log.info(f"credentials='{self.__credentials}'")
            self.__log.info(f"schema={self.__schema}")
            self.__log.info(f"verbose={self.__verbose}")
            self.__log.info(f"log={self.__log}")
            self.__log.info(f"authorization={self.__authorization}")
            self.__log.info(f"user_agent={self.__user_agent}")

        # login
        try:
            self.__session = requests.Session()
            self.__session.post(f'{self.__url}/user', data=self.__user_agent)
        except Exception as e:
            self.__session = None
            if self.__log is not None:
                self.__log.debug(f"self.__session={self.__session}, error={e}")

    # +
    # decorator(s)
    # -
    @property
    def url(self):
        return self.__url

    @url.setter
    def url(self, url):
        self.__url = url if (isinstance(url, str) and url.strip() != '' and
                             url.lower().startswith('http')) else DEFAULT_BASE_URL

    @property
    def credentials(self):
        return self.__credentials

    @credentials.setter
    def credentials(self, credentials):
        self.__credentials = credentials if (isinstance(credentials, str) and credentials.strip() != '' and
                                             ':' in credentials) else DEFAULT_CREDENTIALS
        self.__username, self.__password = self.__credentials.split(':')

    @property
    def log(self):
        return self.__log

    @log.setter
    def log(self, log):
        self.__log = UtilsLogger('LigoQ3cScrapeUserAgent').logger if self.__verbose else None

    @property
    def schema(self):
        return self.__schema

    @schema.setter
    def schema(self, schema):
        self.__schema = schema.lower() if schema in TNS_LIGO_SUPPORTED_SCHEMAS else DEFAULT_SCHEMA

    @property
    def verbose(self):
        return self.__verbose

    @verbose.setter
    def verbose(self, verbose):
        self.__verbose = verbose if isinstance(verbose, bool) else False

    @property
    def authorization(self):
        return self.__authorization

    @property
    def user_agent(self):
        return self.__user_agent

    # +
    # method: dump()
    # -
    @staticmethod
    def dump(_item=None, _delimiter='\n'):
        if _item is None:
            _res = ''
        elif isinstance(_item, tuple) and _item != ():
            _res = ''.join(f'{_v}{_delimiter}' for _v in _item)[:-1]
        elif isinstance(_item, list) and _item != []:
            _res = ''.join(f'{_v}{_delimiter}' for _v in _item)[:-1]
        elif isinstance(_item, set) and _item != set():
            _res = ''.join(f'{_v}{_delimiter}' for _v in _item)[:-1]
        elif isinstance(_item, dict) and _item != {}:
            _res = ''.join(f'{_k}={_v}{_delimiter}' for _k, _v in _item.items())[:-1]
        else:
            _res = f'{_item}'
        return _res

    # +
    # method: get_after()
    # -
    def get_after(self, _table=None):
        """ gets links labelled as after the event """

        # check input(s)
        self.__after = []
        if _table is None or not hasattr(_table, 'find_all'):
            return

        # find <td class="cell-downloads"></td> elements
        for _td in _table.find_all('td', attrs={'class': 'cell-downloads'}):
            # find download <a href="http..."></a> elements
            for _a in _td.find_all('a', href=True):
                # check links are in the correct format
                if _a['href'].strip().lower().startswith('http') and \
                        _a['href'].strip().lower().endswith(self.__schema):
                    # check for search pattern
                    if 'after' in _a['href'].strip().lower() and _a['href'].strip() not in self.__after:
                        self.__after.append(_a['href'].strip())

    # +
    # method: get_aka()
    # -
    def get_aka(self, _table=None):
        """ get the also-known-as name of the event """

        # check input(s)
        self.__aka = []
        if _table is None or not hasattr(_table, 'find_all'):
            return

        # find <td class="cell-name"></td> elements
        for _td in _table.find_all('td', attrs={'class': 'cell-name'}):
            # find <a href="/ligo/event/*"></a> elements
            for _a in _td.find_all('a', href=True):
                # check names are in correct format
                if _a['href'].lower().startswith('/ligo/event'):
                    # check for search pattern
                    m = TNS_LIGO_SUPPORTED_EVENTS.search(_a['href'])
                    if m and m.group().strip() not in self.__aka:
                        self.__aka.append(m.group().strip())

    # +
    # method: get_attributes()
    # -
    def get_attributes(self, _table=None):
        """ get table attributes """

        # check input(s)
        self.__columns, self.__headers, self.__rows = 0, [], 0
        if _table is None or not hasattr(_table, 'find_all'):
            return

        # get headers
        for _th in _table.find_all('th'):
            if _th.text.strip() not in self.__headers:
                self.__headers.append(_th.text.strip())

        # get number of columns
        self.__columns = len(self.__headers)

        # get number of rows
        for _row in _table.find_all('tr'):
            if len(_row.find_all('td')) > 0:
                self.__rows += 1

        if self.__verbose:
            self.__log.debug(f"self.__columns={self.__columns}")
            self.__log.debug(f"self.__headers={self.__headers}")
            self.__log.debug(f"self.__rows={self.__rows}")

    # +
    # method: get_before()
    # -
    def get_before(self, _table=None):
        """ gets links labelled as before the event """

        # check input(s)
        self.__before = []
        if _table is None or not hasattr(_table, 'find_all'):
            return

        # find <td class="cell-downloads"></td> elements
        for _td in _table.find_all('td', attrs={'class': 'cell-downloads'}):
            # find download <a href="http..."></a> elements
            for _a in _td.find_all('a', href=True):
                # check links are in the correct format
                if _a['href'].strip().lower().startswith('http') and \
                        _a['href'].strip().lower().endswith(self.__schema):
                    # check for search pattern
                    if 'before' in _a['href'].strip().lower() and _a['href'].strip() not in self.__before:
                        self.__before.append(_a['href'].strip())

    # +
    # method: get_dates()
    # -
    def get_dates(self, _table=None):
        """ get dates of events """

        # check input(s)
        self.__dates = []
        if _table is None or not hasattr(_table, 'find_all'):
            return

        # find <td class="cell-date"></td> elements
        for _td in _table.find_all('td', attrs={'class': 'cell-date'}):
            if _td.text.strip() not in self.__dates:
                self.__dates.append(_td.text.strip())

    # +
    # method: __record() - builds one output record from a scraped before/after entry
    # (factored out of get_events(), which previously repeated this block four times)
    # -
    def __record(self, _key, _entry, _i, _before, _ans):
        _name = f"{self.__names[_i]}-{_key.strip()}"
        _suffix = f'{get_unique_hash()}'[:6]
        if _name in _ans:
            _name = f'{_name}-{_suffix}'
        if self.__verbose:
            self.__log.debug(f"Creating new dictionary element, _ans['{_name}']")
        _ans[_name] = {
            'name': _name,
            'name_prefix': _entry['name_prefix'] if 'name_prefix' in _entry else '',
            'name_suffix': _suffix if _suffix in _name else '',
            'ra': float(_entry['ra']) if 'ra' in _entry else math.nan,
            'dec': float(_entry['dec']) if 'dec' in _entry else math.nan,
            'transient_type': _entry['type'] if 'type' in _entry else '',
            'discovery_date': _entry['discoverydate'] if 'discoverydate' in _entry else '',
            'discovery_mag': float(_entry['discoverymag']) if 'discoverymag' in _entry else math.nan,
            'filter_name': _entry['filter'] if 'filter' in _entry else '',
            'source_group': _entry['source_group'] if 'source_group' in _entry else '',
            'probability': float(_entry['probability']) if 'probability' in _entry else math.nan,
            'sigma': float(_entry['sigma']) if 'sigma' in _entry else math.nan,
            'gw_aka': self.__aka[_i],
            'gw_event': self.__names[_i],
            'gw_date': self.__dates[_i],
            'before': _before
        }

    # +
    # method: get_events()
    # -
    def get_events(self, _table=None):
        """ get all data associated with events """

        # check input(s)
        if _table is None or not hasattr(_table, 'find_all'):
            return {}

        # get data
        self.get_attributes(_table)
        self.get_after(_table)
        self.get_aka(_table)
        self.get_before(_table)
        self.get_dates(_table)
        self.get_names(_table)

        # message(s)
        if self.__verbose:
            self.__log.debug(f"self.__after={self.dump(self.__after, ' ')}, len={len(self.__after)}")
            self.__log.debug(f"self.__aka={self.dump(self.__aka, ' ')}, len={len(self.__aka)}")
            self.__log.debug(f"self.__before={self.dump(self.__before, ' ')}, len={len(self.__before)}")
            self.__log.debug(f"self.__dates={self.dump(self.__dates, ' ')}, len={len(self.__dates)}")
            self.__log.debug(f"self.__names={self.dump(self.__names, ' ')}, len={len(self.__names)}")

        # build one record per entry in the before/after downloads for each row
        _ans = {}
        for _i in range(self.__rows):
            if self.__verbose:
                self.__log.debug(f"scraping row {_i}")

            # get before data from json
            if self.__schema == 'json' and list_has_index(self.__before, _i):
                _before = self.scrape_json(self.__before[_i])
                if _before is not None:
                    for _bk, _bv in _before.items():
                        self.__record(_bk, _bv, _i, True, _ans)

            # get before data from tsv
            elif self.__schema == 'tsv' and list_has_index(self.__before, _i):
                _before = self.scrape_tsv(self.__before[_i])
                if _before is not None:
                    _before = _before.split('\n')
                    _hdr = _before[0].split('\t')
                    for _e in _before[1:]:
                        _entry = dict(zip(_hdr, _e.split('\t')))
                        if len(_hdr) != len(_entry):
                            continue
                        self.__record(_entry['name'], _entry, _i, True, _ans)

            # get after data from json
            if self.__schema == 'json' and list_has_index(self.__after, _i):
                _after = self.scrape_json(self.__after[_i])
                if _after is not None:
                    for _ak, _av in _after.items():
                        self.__record(_ak, _av, _i, False, _ans)

            # get after data from tsv
            elif self.__schema == 'tsv' and list_has_index(self.__after, _i):
                _after = self.scrape_tsv(self.__after[_i])
                if _after is not None:
                    _after = _after.split('\n')
                    _hdr = _after[0].split('\t')
                    for _e in _after[1:]:
                        _entry = dict(zip(_hdr, _e.split('\t')))
                        if len(_hdr) != len(_entry):
                            continue
                        self.__record(_entry['name'], _entry, _i, False, _ans)

        # return
        return _ans

    # +
    # method: get_names()
    # -
    def get_names(self, _table=None):
        """ get names of events """

        # check input(s)
        self.__names = []
        if _table is None or not hasattr(_table, 'find_all'):
            return

        # find <td class="cell-name"></td> elements
        for _td in _table.find_all('td', attrs={'class': 'cell-name'}):
            # find <a href="/ligo/event/*"></a> elements
            for _a in _td.find_all('a', href=True):
                # check names are in correct format
                if _a['href'].lower().startswith('/ligo/event'):
                    self.__names.append(_a.text.strip())

    # +
    # method: get_request()
    # -
    def get_request(self):
        """ get data from web-site """
        # noinspection PyBroadException
        try:
            if self.__verbose:
                self.__log.debug(f"Calling requests.get('{self.__url}', auth='{self.__username, self.__password}')")
            _requests = requests.get(self.__url, headers=self.__user_agent,
                                     auth=(self.__username, self.__password))
            if self.__verbose:
                self.__log.debug(f"Called requests.get('{self.__url}', "
                                 f"auth='{self.__username, self.__password}'), _requests={_requests}")
            if _requests.status_code == 200:
                return _requests
            else:
                if self.__verbose:
                    self.__log.error(f"Bad status code ({_requests.status_code}) calling "
                                     f"requests.get('{self.__url}', auth='{self.__username, self.__password}')")
                return None
        except Exception as e:
            if self.__verbose:
                self.__log.error(f"Failed calling requests.get('{self.__url}', "
                                 f"auth='{self.__username, self.__password}'), error={e}")
            return None

    # +
    # method: scrape_json()
    # -
    def scrape_json(self, _url=''):
        """ get json data from web-site """
        # return data
        if _url.strip().lower().startswith('http') and _url.strip().lower().endswith('json'):
            self.__url = _url
            self.__response = self.get_request()
            if self.__response is not None:
                return self.__response.json()
        return None

    # +
    # method: scrape_tsv()
    # -
    def scrape_tsv(self, _url=''):
        """ get tsv data from web-site """
        # return data
        if _url.strip().lower().startswith('http') and _url.strip().lower().endswith('tsv'):
            self.__url = _url
            self.__response = self.get_request()
            if self.__response is not None:
                return self.__response.content.decode()
        return None

    # +
    # method: scrape_ligo_q3c_events()
    # -
    def scrape_ligo_q3c_events(self):
        """ scrape web-site for ligo_q3c events """

        # get data from the events page (get_request() fetches self.__url as-is, so
        # point it back at the events page in case scrape_json()/scrape_tsv() moved it)
        self.__url = f'{self.__base_url}/ligo/events'
        self.__response = self.get_request()
        if self.__verbose:
            self.__log.debug(f'self.__response={self.__response}')

        # check response
        if self.__response is None:
            return [], 0

        # set up encoding
        _http_encoding = self.__response.encoding if 'charset' in self.__response.headers.get(
            'content-type', '').lower() else None
        _html_encoding = EncodingDetector.find_declared_encoding(self.__response.content, is_html=True)

        # return data
        try:
            self.__soup = BeautifulSoup(self.__response.text, features='html5lib',
                                        from_encoding=(_html_encoding or _http_encoding))
        except Exception as e:
            if self.__verbose:
                self.__log.error(f'Failed to get soup from self.__response, error={e}')
            return [], 0
        if self.__verbose:
            self.__log.info(self.__soup.find_all('table', attrs={'class': 'ligo-alerts-table'}))
        try:
            _ret = [self.get_events(_t) for _t in
                    self.__soup.find_all('table', attrs={'class': 'ligo-alerts-table'})]
            return _ret, len(_ret)
        except Exception as e:
            if self.__verbose:
                self.__log.error(f'Failed to get events from self.__response, error={e}')
            return [], 0
import math
import os
import time
import warnings

import requests
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector

from src.models.ligo_q3c import LigoQ3cRecord
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
# noinspection PyUnresolvedReferences
from utils import UtilsLogger


# +
# suppress all warnings!
# -
warnings.filterwarnings('ignore')


# +
# logging
# -
_log = UtilsLogger('LigoQ3c').logger


# +
# default(s)
# -
DEFAULT_BASE_URL = 'https://www.wis-tns.org'
DEFAULT_LOGIN_URL = f'{DEFAULT_BASE_URL}/user'
DEFAULT_SEARCH_URL = f'{DEFAULT_BASE_URL}/ligo/events'
DEFAULT_CREDENTIALS = ':'
DEFAULT_SCHEMA = 'json'


# +
# constant(s)
# -
SASSY_DB_HOST = os.getenv('SASSY_DB_HOST', None)
SASSY_DB_NAME = os.getenv('SASSY_DB_NAME', None)
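# +
# usage sketch (an illustrative addition, not part of the original module): drives
# LigoQ3cTableParser end-to-end; the 'username:password' pair below is a placeholder,
# and a real TNS account plus network access to wis-tns.org are assumed
# -
if __name__ == '__main__':
    # instantiate the parser against the default site with the default (json) schema
    _parser = LigoQ3cTableParser(url=DEFAULT_BASE_URL, credentials='username:password',
                                 schema=DEFAULT_SCHEMA, verbose=True)
    # scrape every ligo-alerts table; returns ([{event-name: record, ...}, ...], count)
    _events, _num = _parser.scrape_ligo_q3c_events()
    print(f'scraped {_num} table(s)')
    for _table in _events:
        print(_parser.dump(_table))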
class TnsQ3cTableParser(object):

    # +
    # method: __init__()
    # -
    def __init__(self, url=DEFAULT_LOGIN_URL, credentials=DEFAULT_CREDENTIALS, number=DEFAULT_NUMBER,
                 unit=DEFAULT_UNIT, verbose=False):

        # get input(s)
        self.url = url
        self.credentials = credentials
        self.number = number
        self.unit = unit
        self.verbose = verbose

        # private variable(s)
        self.__ans = None
        self.__log = None
        self.__response = None
        self.__soup = None
        self.__params = None
        self.__pages = -1
        self.__session = None
        self.__total = -1
        self.__authorization = {'username': self.__username, 'password': self.__password}
        self.__user_agent = {'user-agent': TNS_USER_AGENT.replace('<username>', self.__username)}

        # verbose
        self.__log = UtilsLogger('TnsQ3cScrapeUserAgent').logger if self.__verbose else None
        if self.__verbose:
            self.__log.info(f"url='{self.__url}'")
            self.__log.info(f"credentials='{self.__credentials}'")
            self.__log.info(f"number={self.__number}")
            self.__log.info(f"unit='{self.__unit}'")
            self.__log.info(f"verbose={self.__verbose}")
            self.__log.info(f"log={self.__log}")
            self.__log.info(f"authorization={self.__authorization}")
            self.__log.info(f"user_agent={self.__user_agent}")

        # login
        try:
            self.__session = requests.Session()
            self.__session.post(self.__url, data=self.__user_agent)
        except Exception as e:
            self.__session = None
            if self.__log is not None:
                self.__log.debug(f"self.__session={self.__session}, error={e}")

    # +
    # decorator(s)
    # -
    @property
    def url(self):
        return self.__url

    @url.setter
    def url(self, url):
        self.__url = url if (isinstance(url, str) and url.lower().startswith('http')) else DEFAULT_LOGIN_URL

    @property
    def credentials(self):
        return self.__credentials

    @credentials.setter
    def credentials(self, credentials):
        self.__credentials = credentials if (isinstance(credentials, str) and
                                             ':' in credentials) else DEFAULT_CREDENTIALS
        self.__username, self.__password = self.__credentials.split(':')

    @property
    def log(self):
        return self.__log

    @log.setter
    def log(self, log):
        self.__log = UtilsLogger('TnsQ3cScrapeUserAgent').logger if self.__verbose else None

    @property
    def number(self):
        return self.__number

    @number.setter
    def number(self, number):
        self.__number = number if (isinstance(number, int) and number > 0) else DEFAULT_NUMBER

    @property
    def unit(self):
        return self.__unit

    @unit.setter
    def unit(self, unit):
        self.__unit = unit if (isinstance(unit, str) and unit in DEFAULT_UNITS) else DEFAULT_UNIT

    @property
    def verbose(self):
        return self.__verbose

    @verbose.setter
    def verbose(self, verbose):
        self.__verbose = verbose if isinstance(verbose, bool) else False

    @property
    def authorization(self):
        return self.__authorization

    @property
    def user_agent(self):
        return self.__user_agent

    # +
    # method: dump()
    # -
    def dump(self, _item=None, _delimiter='\n'):
        if _item is None:
            _res = ''
        elif isinstance(_item, tuple) and _item != ():
            _res = ''.join(f'{_v}{_delimiter}' for _v in _item)[:-1]
        elif isinstance(_item, list) and _item != []:
            _res = ''.join(f'{_v}{_delimiter}' for _v in _item)[:-1]
        elif isinstance(_item, set) and _item != set():
            _res = ''.join(f'{_v}{_delimiter}' for _v in _item)[:-1]
        elif isinstance(_item, dict) and _item != {}:
            _res = ''.join(f'{_k}={_v}{_delimiter}' for _k, _v in _item.items())[:-1]
        elif isinstance(_item, str) and _item.strip().lower() == 'variables':
            _res = f'self.__url = {self.__url}, '
            _res += f'self.__credentials = {self.__credentials}, '
            _res += f'self.__number = {self.__number}, '
            _res += f'self.__unit = {self.__unit}, '
            _res += f'self.__verbose = {self.__verbose}, '
            _res += f'self.__response = {self.__response}, '
            _res += f'self.__params = {self.__params}, '
            _res += f'self.__soup = {self.__soup}, '
            _res += f'self.__session = {self.__session}, '
            _res += f'self.__authorization = {self.__authorization}, '
            _res += f'self.__user_agent = {self.__user_agent}, '
            _res += f'self.__username = {self.__username}, '
            _res += f'self.__password = {self.__password}, '
            _res += f'self.__ans = {self.__ans}, '
            _res += f'self.__pages = {self.__pages}, '
            _res += f'self.__total = {self.__total}'
        else:
            _res = f'{_item}'
        return _res

    # +
    # method: get_request()
    # -
    def get_request(self):
        """ get data from web-site """
        # noinspection PyBroadException
        try:
            if self.__session is not None:
                _requests = self.__session.get(url=self.__url, headers=self.__user_agent, params=self.__params)
            else:
                _requests = requests.get(url=self.__url, headers=self.__user_agent, params=self.__params,
                                         auth=(self.__username, self.__password))
        except Exception as e:
            if self.__verbose:
                self.__log.error(f"Failed calling self.get_request(), error={e}")
            return None

        # return data
        if _requests.status_code != 200 or _requests.text.strip() == '':
            if self.__verbose:
                self.__log.error(f"Bad response (code={_requests.status_code}, text='{_requests.text[:80]}...')")
            return None
        else:
            return _requests

    # +
    # method: get_records()
    # -
    def get_records(self):
        """ scrape records from soup """

        # output key -> results-table cell class for every plain-text column; sample markup:
        # <td class="cell-id">6565</td>, <td class="cell-ra">17:18:23.982</td>,
        # <td class="cell-decl">-31:04:29.63</td>, <td class="cell-source_group_name">ATLAS</td>,
        # <td class="cell-internal_name">ATLAS19svo</td>,
        # <td class="cell-discovering_instrument_name">ATLAS1 - ACAM1</td>,
        # <td class="cell-isTNS_AT">Y</td>, <td class="cell-public">Y</td>,
        # <td class="cell-discoverymag">17.775</td>, <td class="cell-disc_filter_name">orange-ATLAS</td>,
        # <td class="cell-discoverydate">2019-08-22 06:59:02</td>, <td class="cell-discoverer">ATLAS_Bot1</td>
        _cells = {
            'tns_id': 'cell-id',
            'tns_class': 'cell-class',
            'ra': 'cell-ra',
            'decl': 'cell-decl',
            'ot_name': 'cell-ot_name',
            'redshift': 'cell-redshift',
            'hostname': 'cell-hostname',
            'host_redshift': 'cell-host_redshift',
            'source_group': 'cell-source_group_name',
            'classifying_group': 'cell-classifying_source_group_name',
            'groups': 'cell-groups',
            'internal_name': 'cell-internal_name',
            'instrument_name': 'cell-discovering_instrument_name',
            'classifying_instrument': 'cell-classifing_instrument_name',
            'isTNS_AT': 'cell-isTNS_AT',
            'public': 'cell-public',
            'end_prop_period': 'cell-end_prop_period',
            'spectra_count': 'cell-spectra_count',
            'mag': 'cell-discoverymag',
            'filter': 'cell-disc_filter_name',
            'date': 'cell-discoverydate',
            'discoverer': 'cell-discoverer',
            'remarks': 'cell-remarks',
            'sources': 'cell-sources',
            'bibcode': 'cell-bibcode',
            'catalogs': 'cell-ext_catalogs',
        }

        # helper: stripped text of the <td> with the given class, or '' (this replaces
        # the per-column try/except blocks that were previously written out long-hand)
        def _cell(_elem, _class):
            try:
                return _elem.find('td', attrs={'class': _class}).text.strip()
            except Exception:
                return ''

        # check soup
        if self.__soup is None:
            return

        # get the results table and extract the rows we want
        _table = self.__soup.find_all('table', attrs={'class': 'results-table sticky-enabled'})
        _repe = [_e.find_all('tr', attrs={'class': 'row-even public even'}) for _e in _table][0]
        _repo = [_e.find_all('tr', attrs={'class': 'row-even public odd'}) for _e in _table][0]
        _rope = [_e.find_all('tr', attrs={'class': 'row-odd public even'}) for _e in _table][0]
        _ropo = [_e.find_all('tr', attrs={'class': 'row-odd public odd'}) for _e in _table][0]
        if self.__verbose:
            self.__log.info(f"len(_repe)={len(_repe)}")
            self.__log.info(f"len(_repo)={len(_repo)}")
            self.__log.info(f"len(_rope)={len(_rope)}")
            self.__log.info(f"len(_ropo)={len(_ropo)}")
        _evens = set().union(_repe, _repo)
        _odds = set().union(_rope, _ropo)
        if self.__verbose:
            self.__log.info(f"len(_evens)={len(_evens)}")
            self.__log.info(f"len(_odds)={len(_odds)}")
        _rows = list(set().union(_evens, _odds))
        if self.__verbose:
            self.__log.info(f"len(_rows)={len(_rows)}")

        # scrape each row
        for _e in _rows:

            # ignore elements that have no find()/find_all()
            if not (hasattr(_e, 'find') or hasattr(_e, 'find_all')):
                continue
            if self.__verbose:
                self.__log.info(f"scraping row {_e}")

            # plain-text cells
            _ans_tmp = {_k: _cell(_e, _v) for _k, _v in _cells.items()}

            # <td class="cell-name"><a href="/object/2015z">SN 2015Z</a></td>
            try:
                _anchor = _e.find('td', attrs={'class': 'cell-name'}).find('a', href=True)
                _ans_tmp['tns_name'] = _anchor.text.strip()
                _ans_tmp['tns_link'] = f"{DEFAULT_BASE_URL}{_anchor['href']}"
            except Exception:
                _ans_tmp['tns_name'] = ''
                _ans_tmp['tns_link'] = ''

            # <td class="cell-reps">1<a class="cert-open" href="/object/2019oel/discovery-cert" rel="43659"></a>
            # <a class="at-reps-open clearfix" href="/%23" rel="43659"></a></td>
            try:
                _cert = _e.find('td', attrs={'class': 'cell-reps'}).find('a', href=True)['href']
                _ans_tmp['tns_cert'] = f"{DEFAULT_BASE_URL}{_cert}"
            except Exception:
                _ans_tmp['tns_cert'] = ''

            # add it to the result(s) only if the essential columns are present
            if _ans_tmp['tns_id'] != '' and _ans_tmp['tns_name'] != '' and \
                    _ans_tmp['ra'] != '' and _ans_tmp['decl'] != '':
                if self.__verbose:
                    self.__log.debug(f"scraped row {_ans_tmp}")
                self.__ans.append(_ans_tmp)
            else:
                if self.__verbose:
                    self.__log.debug(f"ignoring {_ans_tmp}")

    # +
    # method: get_soup()
    # -
    def get_soup(self, _page=0):
        """ scrape web-site page """

        # set default(s)
        self.__params['page'] = _page if (isinstance(_page, int) and _page > 0) else 0
        if self.__verbose:
            self.__log.debug(f'self.__params={self.__params}')

        # get request
        self.__response = self.get_request()
        if self.__verbose:
            self.__log.debug(f'self.__response={self.__response}')
        if self.__response is None:
            self.__soup = None
            return

        # get encoding
        _http_encoding = self.__response.encoding if 'charset' in self.__response.headers.get(
            'content-type', '').lower() else None
        _html_encoding = EncodingDetector.find_declared_encoding(self.__response.content, is_html=True)

        # get soup
        self.__soup = None
        try:
            if self.__verbose:
                self.__log.debug('Getting soup from self.__response.text')
            self.__soup = BeautifulSoup(self.__response.text, features='html5lib',
                                        from_encoding=(_html_encoding or _http_encoding))
            if self.__verbose:
                self.__log.debug('Got soup from self.__response.text OK')
        except Exception as e:
            self.__soup = None
            if self.__verbose:
                self.__log.error(f'Failed to get soup from self.__response.text, error={e}')

    # +
    # method: scrape_tns_pages()
    # -
    def scrape_tns_pages(self):
        """ scrape web-site for tns pages """

        # set default(s)
        self.__ans = []
        self.__pages = -1
        self.__total = -1
        self.__url = DEFAULT_SEARCH_URL
        self.__params = dict(DEFAULT_SEARCH_PARAMS)
        self.__params['discovered_period_value'] = f'{self.__number}'
        self.__params['discovered_period_units'] = f'{self.__unit}'

        # get soup
        self.get_soup()
        if self.__verbose:
            self.__log.debug(f'type(self.__soup)={type(self.__soup)}')

        # get max number of results by scraping
        if self.__soup is not None:
            try:
                _div = self.__soup.find_all('div', attrs={'class': 'count rsults'})
                _ems = [_e.find_all('em', attrs={'class': 'placeholder'}) for _e in _div][0]
                self.__total = int(_ems[-1].text)
            except Exception as e:
                if self.__verbose:
                    self.__log.error(f'Failed to get result count from self.__soup, error={e}')
                return self.__total, self.__ans
            if self.__verbose:
                self.__log.debug(f'self.__total={self.__total}')
                self.__log.debug(f"self.__params['num_page']={self.__params['num_page']}")
        else:
            return self.__total, self.__ans

        # calculate pages
        self.__pages = math.ceil(int(self.__total) / int(self.__params['num_page']))
        if self.__verbose:
            self.__log.debug(f'self.__pages={self.__pages}')

        # get record(s) for page 0
        self.get_records()

        # get record(s) for other pages
        if self.__pages > 0:
            for _i in range(1, self.__pages):
                self.get_soup(_i)
                self.get_records()
                if self.__verbose:
                    self.__log.debug('sleeping for 5 seconds ...')
                time.sleep(5)

        # return result
        if self.__verbose:
            self.__log.debug(f'self.__total = {self.__total}')
            self.__log.debug(f'len(self.__ans) = {len(self.__ans)}')
        return self.__total, self.__ans
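# +
# usage sketch (an illustrative addition, not part of the original module): pulls recent
# TNS discoveries; 'username:password' is a placeholder credential, and number=7 /
# unit='days' are example values that fall back to DEFAULT_NUMBER / DEFAULT_UNIT
# if 'days' is not in DEFAULT_UNITS
# -
if __name__ == '__main__':
    # instantiate the parser; number/unit select the discovered-period search window
    _scraper = TnsQ3cTableParser(credentials='username:password', number=7, unit='days', verbose=True)
    # walk every results page and collect one dictionary per acceptable row
    _total, _records = _scraper.scrape_tns_pages()
    print(f'site reports {_total} result(s); scraped {len(_records)} record(s)')
    for _record in _records[:5]:
        print(_scraper.dump(_record))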