def __init__(self, access_key=None, secret=None,
             failed_table_name='parsed_simple_failed',
             missing_table_name='parsed_simple_missing',
             success_table_name='parsed_simple_good'):
    """
    If access_key and/or secret are not passed in, assumes we are accessing
    erenev's aws account and that the access info is stored as environment
    variables on the current server.
    Connection and Table are available to clients via self properties, in case
    clients wish to use those objects directly.
    """
    access_key = access_key or get_environment_variable('VEN_S3_ACCESS_KEY')
    secret = secret or get_environment_variable('VEN_S3_SECRET')
    self.connection = boto.dynamodb2.connect_to_region(
        region_name='eu-west-1',
        aws_access_key_id=access_key,
        aws_secret_access_key=secret)
    self.failed_table = Table(failed_table_name, connection=self.connection)
    self.missing_table = Table(missing_table_name, connection=self.connection)
    self.success_table = Table(success_table_name, connection=self.connection)
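# Usage sketch (hypothetical: the enclosing class name is assumed here, and the
# VEN_S3_ACCESS_KEY / VEN_S3_SECRET environment variables must be set):
#
#   accessor = ParsedSimpleStore()  # assumed class name
#   accessor.success_table.put_item(data={'id': 'abc'})
#   accessor.connection             # raw boto connection, if needed directly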
def __init__(self, to_addresses=None, sender=None, aws_key=None, aws_secret=None):
    self.sender = sender or 'Thor Stats <*****@*****.**>'
    self.to_addresses = to_addresses or '*****@*****.**'
    self.aws_key = aws_key or get_environment_variable('VEN_S3_ACCESS_KEY')
    self.aws_secret = aws_secret or get_environment_variable('VEN_S3_SECRET')
def __init__(self, bucket_name, region=S3_DEFAULT_REGION):
    """
    Creates an instance with a handle on the S3 bucket corresponding to
    bucket_name. If bucket_name does not exist, it is created.
    :type bucket_name: str
    """
    self.conn = boto.s3.connect_to_region(
        region,
        aws_access_key_id=get_environment_variable('VEN_S3_ACCESS_KEY'),
        aws_secret_access_key=get_environment_variable('VEN_S3_SECRET'))
    if bucket_name not in [b.name for b in self.conn.get_all_buckets()]:
        self.conn.create_bucket(bucket_name, location=Location.EU)
    self.bucket = self.conn.get_bucket(bucket_name)
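# Usage sketch (hypothetical class name; assumes the VEN_S3_* environment
# variables are set):
#
#   store = S3Bucket('my-bucket')              # creates the bucket if missing
#   k = store.bucket.new_key('some/path.txt')
#   k.set_contents_from_string('hello')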
def datapath(filename=''):
    # NOTE: Tried os.path.join first, but noticed that if filename started with
    # a /, we didn't get the MS_DATA root. So for simplicity and (??)
    # performance, chose to use concatenation.
    # Protect filename from None:
    filename = filename or ''
    return get_environment_variable('MS_DATA') + filename
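# Illustration of the os.path.join behavior noted above (standard Python
# semantics: an absolute second argument discards everything before it):
#
#   os.path.join('/data/root', '/foo.csv')   # -> '/foo.csv'
#   '/data/root' + '/foo.csv'                # -> '/data/root/foo.csv'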
def __init__(self, account_id='7998744469', store=None, min_money=None,
             max_money=None, logging_level=logging.DEBUG, chk_size=5000):
    """
    Pass in a min and/or max bid (in Euros, NOT micros) if you want to
    override the GMoney defaults.
    store: storage that acts like an HDF5 store
    """
    self.conn = Connection(
        password=get_environment_variable('VEN_ADWORDS_PASSWORD'),
        developer_token=get_environment_variable('VEN_ADWORDS_TOKEN'),
        account_id=account_id)
    self.awq = AWQ(self.conn)
    self.gmoney = GMoney(min_money=min_money, max_money=max_money)
    self.ops = Operations(self.gmoney)
    self.mutations = Mutations(self.conn)
    self.chk_size = chk_size
    self.store = store
    self.logger = KhanLogger(level=logging_level, origin=self.__class__.__name__)
    # Convenience properties to make it easier to call methods in succession
    # without tracking job ids:
    self.job_ids = None
    self.job_ids_completed = None
    self.job_ids_failed = None
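# For reference when choosing min_money / max_money: the AdWords API expresses
# money in micros, where 1 EUR == 1,000,000 micros (a fact of the API, not of
# this codebase):
#
#   def euros_to_micros(euros):
#       return int(round(euros * 1e6))
#
#   euros_to_micros(1.50)  # -> 1500000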
def __init__(self, bucket_name=None, base_folder=None, extension=None,
             force_extension=False, encoding=None, access_key=None, secret=None):
    """
    Creates an instance with a handle on the S3 bucket corresponding to
    bucket_name.
    If access_key and/or secret are not passed in, assumes we are accessing
    erenev's aws account and that the access info is stored as environment
    variables on the current server.
    Connection and bucket are available to clients via self properties, in case
    clients wish to use those objects directly.
    """
    assert not extension, "extension has not been implemented yet for S3."
    if access_key and not secret:
        if access_key == 'ut':
            access_key = get_environment_variable('VEN_AWS_ACCESS_KEY_ID')
            secret = get_environment_variable('VEN_AWS_SECRET_ACCESS_KEY')
        elif access_key == 'mon':
            access_key = get_environment_variable('MON_AWS_ACCESS_KEY_ID')
            secret = get_environment_variable('MON_AWS_SECRET_ACCESS_KEY')
        else:
            raise ValueError('I cannot recognize that access_key')
    else:
        # if access_key is not given, take a default
        access_key = access_key or get_environment_variable('VEN_AWS_ACCESS_KEY_ID')
        secret = secret or get_environment_variable('VEN_AWS_SECRET_ACCESS_KEY')
    # note - this calls the setter
    self.base_folder = base_folder
    self.extension = extension
    self.force_extension = force_extension
    self.encoding = encoding
    self.connection = S3Connection(access_key, secret, host='s3-eu-west-1.amazonaws.com')
    if bucket_name:
        self.bucket = self.connection.get_bucket(bucket_name)
    else:
        self.bucket = None
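# Usage sketch (hypothetical class name; 'ut' and 'mon' act as credential
# aliases expanded from the corresponding *_AWS_* environment variables):
#
#   s3 = S3Accessor(bucket_name='my-bucket', access_key='ut')
#   key = s3.bucket.get_key('some/key')   # connection and bucket are exposed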
__author__ = 'thorwhalen'

from ut.datapath import datapath
import pickle
import os
from ut.util.importing import get_environment_variable
import pandas as pd
import ut.pfile.to as file_to
import ut.pfile.name as pfile_name
import ut.pstr.to as str_to
from ut.pstr.trans import str_to_unicode_or_bust
# from os import environ  # does this load the whole array? Can we just take MS_DATA instead?

try:
    MS_DATA = get_environment_variable('MS_DATA')
except KeyError:
    MS_DATA = ''

ENCODING_NONE = 0
ENCODING_UNICODE = 1
ENCODING_UTF8 = 2


class Local(object):
    def __init__(self, relative_root=None, extension='', force_extension=False,
                 encoding='UTF-8', mother_root=MS_DATA):
        if relative_root:
__author__ = 'thor'

import ut.pfile.accessor as pfile_accessor
import os
from ut.util.importing import get_environment_variable
import pdb

PROJECT_PATH = get_environment_variable('PY_PROJ_FOLDER')
# import os
# PROJECT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
# THOR_KHAN_CODE_PATH = "/D/Dropbox/dev/py/proj/khan/"
# MATT_KHAN_CODE_PATH = "/Users/mattjmorris/Dev/python/khan/"

pdbrc_file = '.pdbrc'

# constant .pdbrc aliases
# define pdbrc_prefix
pdbrc_prefix = "##### PDBRC PREFIX ###########################" + "\n"
pdbrc_prefix = pdbrc_prefix + "\n" + "# Print a dictionary, sorted. %1 is the dict, %2 is the prefix for the names."
pdbrc_prefix = pdbrc_prefix + "\n" + 'alias p_ for k in sorted(%1.keys()): print "%s%-15s= %-80.80s" % ("%2",k,repr(%1[k]))'
pdbrc_prefix = pdbrc_prefix + "\n" + ""
pdbrc_prefix = pdbrc_prefix + "\n" + "# Print the instance variables of a thing."
pdbrc_prefix = pdbrc_prefix + "\n" + "alias pi p_ %1.__dict__ %1."
pdbrc_prefix = pdbrc_prefix + "\n" + ""
pdbrc_prefix = pdbrc_prefix + "\n" + "# Print the instance variables of self."
pdbrc_prefix = pdbrc_prefix + "\n" + "alias ps pi self"
pdbrc_prefix = pdbrc_prefix + "\n" + ""
pdbrc_prefix = pdbrc_prefix + "\n" + "# Print the locals."
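# Illustrative pdb session using the aliases defined above, once written to
# .pdbrc (names are placeholders):
#
#   (Pdb) p_ my_dict pre_   # print my_dict sorted, names prefixed with "pre_"
#   (Pdb) pi obj            # print obj's instance variables
#   (Pdb) ps                # print self's instance variables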
class Browser(object):
    """
    Base class for slurpers
    """
    # Load up scraper config stuff from config file upon loading class definition
    CONFIG = json.load(open(os.path.dirname(__file__) + "/config.json"))
    USER_AGENTS = CONFIG["USER_AGENTS"]
    HEADERS = CONFIG["HEADERS"]
    try:
        ANONYMIZER_AUTH = HTTPProxyAuth(
            get_environment_variable('VEN_ANONYMIZER_LOGIN'),
            get_environment_variable('VEN_ANONYMIZER_PASS'))
        ANONYMIZER_PROXIES = {
            'http': get_environment_variable('VEN_ANONYMIZER_PROX_HTTP')
        }  # ,'https': get_environment_variable('VEN_ANONYMIZER_PROX_HTTPS')}
    except KeyError:
        print("VEN_ANONYMIZER_LOGIN, VEN_ANONYMIZER_PASS, and/or VEN_ANONYMIZER_PROX_HTTP missing from environment")
    try:
        PROXYMESH_AUTH = requests.auth.HTTPProxyAuth(
            get_environment_variable('PROXYMESH_USER'),
            get_environment_variable('PROXYMESH_PASS'))
        PROXYMESH_PROXIES = {'http': 'http://us.proxymesh.com:31280'}
    except KeyError:
        print("PROXYMESH_USER and/or PROXYMESH_PASS missing from environment")

    def __init__(self, **kwargs):
        """
        Creates an instance that will use proxies and authorization if proxies & auth are provided.
        :type proxies: dict
        :type auth: requests.auth.HTTPProxyAuth
        """
        default_kwargs = {
            'get_header_fun': 'header_with_random_firefox_user_agent',
            'random_agents': [],
            'header': {
                "User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "close",
                "DNT": "1"
            },
            'proxies': None,
            'auth': None,
            'timeout': 10.0
        }
        self = ut.util.pobj.set_attributes(self, kwargs, default_kwargs)

    @classmethod
    def options(cls):
        print '''
        get_header_fun:
            'header_with_random_firefox_user_agent' (default)
            'fixed_header'
        '''

    def get_header(self):
        if self.get_header_fun == 'header_with_random_firefox_user_agent':
            return self.header_with_random_user_agent(filter_in_user_agents='^Mozilla.*')
        elif self.get_header_fun == 'fixed_header':
            return self.header
        else:
            raise ValueError("Unknown get_header_fun value")

    def header_with_random_user_agent(self, filter_in_user_agents=None):
        """
        Returns a header with a random user agent
        """
        headers = self.HEADERS.copy()
        headers['User-Agent'] = self.random_user_agent(filter_in_user_agents=filter_in_user_agents)
        return headers

    def random_user_agent(self, filter_in_user_agents=None):
        """
        Returns a random user agent from the full list of user agents.
        Cycles through all agents before re-sampling from the full list again.
""" if not self.random_agents: self.random_agents = self.USER_AGENTS[:] if filter_in_user_agents: self.random_agents = filter( re.compile(filter_in_user_agents).search, self.random_agents) random.shuffle(self.random_agents) return self.random_agents.pop() def get_html_through_tor_unfinished(self, url): UserWarning("This hasn't really be coded yet...") proxy_support = urllib2.ProxyHandler({"http": "127.0.0.1:8118"}) opener = urllib2.build_opener(proxy_support) opener.addheaders = [('User-agent', self.random_user_agent())] return opener.open(url).read() def get_html_through_requests(self, url, url_params={}, timeout=None): r = self.get_response_through_requests(url=url, url_params=url_params, timeout=timeout) # return the text if no error if not r.ok: raise ValueError('HTTP Error: {} for url {} (headers="{}"'.format( r.status_code, url, str(r.headers))) else: return r.text def get_response_through_requests(self, url, url_params={}, timeout=None): timeout = timeout or self.timeout header = self.header_with_random_user_agent() # get the content for the url r = requests.get(url, headers=header, params=url_params, timeout=timeout, proxies=self.proxies, auth=self.auth) return r def get_html_through_selenium(self, url, url_params={}, timeout=None): ValueError("You haven't written this yet!!") pass # timeout = timeout or self.timeout # header = self.header_with_random_user_agent() # # get the content for the url # r = requests.get(url, headers=header, params=url_params, # timeout=timeout, proxies=self.proxies, auth=self.auth) # # return the text if no error # if not r.ok: # raise ValueError('HTTP Error: {} for url {} (user-agent="{}"'.format(r.status_code, url, header['User-Agent'] )) # else: # return r.text @classmethod def with_ven_anonymizer(cls): return Browser(proxies={ 'http': get_environment_variable('VEN_ANONYMIZER_PROX_HTTP') }, auth=HTTPProxyAuth( get_environment_variable('VEN_ANONYMIZER_LOGIN'), get_environment_variable('VEN_ANONYMIZER_PASS'))) @classmethod def with_proxymesh(cls): return Browser(proxies=cls.PROXYMESH_PROXIES, auth=cls.PROXYMESH_AUTH) @classmethod def firefox_selenium(cls, **kwargs): default_kwargs = {'webdriver': webdriver.Firefox()} kwargs = dict(default_kwargs, **kwargs) return Browser(**kwargs)
class Amazon(object):
    url_template = dict()
    url_template['product_page'] = 'http://www.amazon.{country}/dp/{asin}/'
    url_template['product_reviews'] = 'http://www.amazon.{country}/product-reviews/{asin}/'

    regexp = dict()
    regexp['nreviews_re'] = {
        'com': re.compile(r'\d[\d,]*(?= customer review)'),
        'co.uk': re.compile(r'\d[\d,]*(?= customer review)'),
        'in': re.compile(r'\d[\d,]*(?= customer review)'),
        'de': re.compile(r'\d[\d\.]*(?= Kundenrezens\w\w)')
    }
    regexp['no_reviews_re'] = {
        'com': re.compile('no customer reviews'),
        'co.uk': re.compile('no customer reviews'),
        'in': re.compile('no customer reviews'),
        'de': re.compile('Noch keine Kundenrezensionen')
    }
    # regexp['average_rating_re'] = {'com': re.compile('')}

    default = dict()
    default['country'] = 'com'
    # default['requests_kwargs'] = {}
    default['requests_kwargs'] = {
        'proxies': {'http': 'http://us.proxymesh.com:31280'},
        'auth': requests.auth.HTTPProxyAuth(
            get_environment_variable('PROXYMESH_USER'),
            get_environment_variable('PROXYMESH_PASS'))
    }

    @classmethod
    def url(cls, what='product_page', **kwargs):
        kwargs = dict(Amazon.default, **kwargs)
        return cls.url_template[what].format(**kwargs)

    @classmethod
    def slurp(cls, what='product_page', **kwargs):
        kwargs = dict(Amazon.default, **kwargs)
        r = requests.get(Amazon.url(what=what, **kwargs),
                         **Amazon.default['requests_kwargs'])
        if r.status_code == 200:
            return r.text
        else:  # try again and return no matter what
            r = requests.get(Amazon.url(what=what, **kwargs),
                             **Amazon.default['requests_kwargs'])
            return r.text

    # @classmethod
    # def get_dynamic_book_info(cls, asin, **kwargs):
    #     html = Amazon.slurp(what='product_page', **kwargs)
    #     b = bs3_BeautifulSoup(b)

    @classmethod
    def get_info(cls, asin, country='co.uk', **kwargs):
        info = {'date': datetime.now()}
        info = dict(info, **{'sales_ranks': cls.get_sales_rank(asin, country=country, **kwargs)})
        # info = dict(info, **{'num_of_reviews': cls.get_number_of_reviews(asin, country=country, **kwargs)})
        return info

    @classmethod
    def get_sales_rank(cls, asin, **kwargs):
        html = Amazon.slurp(what='product_page', asin=asin, **kwargs)
        sales_rank = [Amazon.parse_sales_rank(html, **kwargs)]
        sales_rank += Amazon.parse_sales_sub_rank(html, **kwargs)
        return sales_rank

    @classmethod
    def parse_product_title(cls, b, **kwargs):
        if not isinstance(b, bs3_BeautifulSoup):
            b = bs3_BeautifulSoup(b)
        return b.find('span', attrs={'id': 'productTitle'}).text

    @classmethod
    def parse_sales_rank(cls, b, **kwargs):
        if not isinstance(b, bs3_BeautifulSoup):
            b = bs3_BeautifulSoup(b)
        t = b.find('li', attrs={'id': re.compile('SalesRank')})
        sales_rank_re = re.compile(r'(\d[\d,]+) in ([\w\ ]+)')
        tt = sales_rank_re.findall(t.text)
        return {'sales_rank': int(re.compile(r'\D').sub('', tt[0][0])),
                'sales_rank_category': tt[0][1].strip(' ')}

    @classmethod
    def parse_sales_sub_rank(cls, b, **kwargs):
        if not isinstance(b, bs3_BeautifulSoup):
            b = bs3_BeautifulSoup(b)
        t = b.find('li', attrs={'id': re.compile('SalesRank')})
        tt = t.findAll('li', 'zg_hrsr_item')
        sales_sub_rank = list()
        for tti in tt:
            d = dict()
            d['sales_rank'] = int(re.compile(r'\D').sub('', tti.find('span', 'zg_hrsr_rank').text))
            ttt = tti.find('span', 'zg_hrsr_ladder')
            ttt = ttt.text.split(' ')[1]
            d['sales_rank_category'] = ttt.split('>')
            sales_sub_rank.append(d)
        return sales_sub_rank

    @classmethod
    def parse_avg_rating(cls, b, **kwargs):
        if not isinstance(b, bs3_BeautifulSoup):
            b = bs3_BeautifulSoup(b)
        t = b.find('span', 'reviewCountTextLinkedHistogram')
        return float(re.compile(r'[\d\.]+').findall(t['title'])[0])

    # NOTE: this second definition shadows the parse_product_title defined above.
    @classmethod
    def parse_product_title(cls, b, **kwargs):
        if not isinstance(b, bs3_BeautifulSoup):
            b = bs3_BeautifulSoup(b)
        t = b.find('div', attrs={'id': 'title'})
        return t.find('span', attrs={'id': 'productTitle'}).text

    @staticmethod
    def test_rating_scrape_with_vanessas_book():
        html = Amazon.slurp(what='product_page', country='co.uk', asin='1857886127')

    @staticmethod
    def get_number_of_reviews(asin, country, **kwargs):
        url = 'http://www.amazon.{country}/product-reviews/{asin}'.format(country=country, asin=asin)
        html = requests.get(url).text
        try:
            return int(re.compile(r'\D').sub('', Amazon.regexp['nreviews_re'][country].search(html).group(0)))
        except Exception:
            if Amazon.regexp['no_reviews_re'][country].search(html):
                return 0
            else:
                return None  # to distinguish from 0, and handle more cases if necessary
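# Usage sketch (the ASIN is the one from the test above; requires the
# PROXYMESH_* environment variables consumed by Amazon.default['requests_kwargs']):
#
#   n_reviews = Amazon.get_number_of_reviews('1857886127', country='co.uk')
#   ranks = Amazon.get_sales_rank('1857886127', country='co.uk')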
url_from_filename
'''

from ut.util.importing import get_environment_variable
import logging
import urllib2
import urlparse
from bs4 import BeautifulSoup
from BeautifulSoup import BeautifulSoup as bs3_BeautifulSoup
import requests
from selenium import webdriver
# from selenium.webdriver.common.keys import Keys

import ut
import ut.parse.util
from ut.parse.util import disp_html
import ut.util.log
import ut.webscrape.util
from ut.webscrape.util import filename_from_url
from ut.webscrape.util import url_from_filename
from ut.parse.util import open_html_in_firefox

logging.basicConfig(filename=get_environment_variable('DEFAULT_LOG_FILE'),
                    filemode='w', level=logging.DEBUG)
print "logging in use:\n %s" % get_environment_variable('DEFAULT_LOG_FILE')
class Yboss(object):
    default_yboss_attrs = {
        'oauth_consumer_key': get_environment_variable('MON_YB_KEY'),
        'oauth_consumer_secret': get_environment_variable('MON_YB_SECRET'),
        'default_service': 'limitedweb',
        'default_params': {},
        'default_save_folder': os.getcwd()
    }

    def __init__(self, **kwargs):
        self.__dict__.update(Yboss.default_yboss_attrs)
        self.__dict__.update(kwargs)
        self.consumer = oauth2.Consumer(key=self.oauth_consumer_key,
                                        secret=self.oauth_consumer_secret)

    ####################################################################################
    ###### SLURPERS ####################################################################

    def slurp_raw(self, query, service=None, params=None):
        service, params = self.fill_with_defaults(service, params)
        url = self.url(query, service, params)
        request_params = {
            'oauth_version': '1.0',
            'oauth_nonce': oauth2.generate_nonce(),
            'oauth_timestamp': int(time.time()),
        }
        oauth_request = oauth2.Request(method='GET', url=url, parameters=request_params)
        oauth_request.sign_request(oauth2.SignatureMethod_HMAC_SHA1(), self.consumer, None)
        oauth_header = oauth_request.to_header(realm='yahooapis.com')
        # Get search results
        http = httplib2.Http()
        resp, content = http.request(url, 'GET', headers=oauth_header)
        return {'content': content, 'resp': resp}
        # return "{'resp': %s, 'content': %s}" % (resp, content)

    def slurp_content(self, query, service=None, params=None):
        resp_content = self.slurp_raw(query, service=service, params=params)
        return resp_content['content']

    def slurp_content_as_dict(self, query, service=None, params=None):
        return json.loads(self.slurp_content(query, service=service, params=params))

    def slurp_content_and_save(self, query, service=None, params=None, filepath=None):
        filepath = self.get_filepath_for_params(query=query, service=service,
                                                params=params, filepath=filepath)
        resp_content = self.slurp_raw(query, service=service, params=params)
        json.dump(resp_content['content'], open(filepath, 'w'))

    def slurp_df_and_save(self, query, service=None, params=None, filepath=None, n_pages=1):
        filepath = self.get_filepath_for_params(query=query, service=service,
                                                params=params, filepath=filepath)
        df = self.slurp_results_df_multiple_pages(query=query, service=service,
                                                  params=params, n_pages=n_pages)
        pd.to_pickle(df, filepath)
        return df

    def get_df(self, query, service=None, params=None, filepath=None, n_pages=1, overwrite=False):
        filepath = self.get_filepath_for_params(query=query, service=service,
                                                params=params, filepath=filepath)
        if not overwrite and os.path.exists(filepath):
            return pd.read_pickle(filepath)
        else:
            return self.slurp_df_and_save(query=query, service=service,
                                          params=params, n_pages=n_pages)

    def slurp_results_df(self, query, service=None, params=None):
        content_dict = json.loads(self.slurp_content(query, service=service, params=params))
        content_dict = self.get_item(content_dict)
        return self.content_to_results_df(content_dict)

    def content_to_results_df(self, content_dict):
        df = pd.DataFrame(content_dict['results'])
        start_position = int(content_dict['start'])
        df['position'] = range(start_position, start_position + len(df))
        df._metadata = {'totalresults': int(content_dict['totalresults'])}
        return df

    def slurp_results_df_multiple_pages(self, query, service=None, params=None, n_pages=5):
        service, params = self.fill_with_defaults(service, params)
        df = pd.DataFrame()
        new_df = pd.DataFrame()
        for i in range(n_pages):
            # print "slurping %d/%d" % (i, n_pages-1)
            try:
                new_df = self.slurp_results_df(query, service=service, params=params)
                df = pd.concat([df, new_df])
            except Exception:
                break
            params['start'] += params['count']
        df._metadata = new_df._metadata
        return df

    ####################################################################################
    ###### CONTENT ACCESSORS ###########################################################

    def load_json_dict(self, filepath):
        filepath = self.get_filepath(filepath)
        return json.loads(json.load(open(filepath, 'r')))

    def get_service_results_getter(self, service):
        service = service or self.default_service
        return mk_fixed_coordinates_value_getter(['bossresponse', service])

    @classmethod
    def get_results(cls, content_dict):
        return Yboss.get_item(content_dict)['results']

    @classmethod
    def get_totalresults(cls, content_dict):
        return int(Yboss.get_item(content_dict)['totalresults'])

    @classmethod
    def get_item(cls, content_dict):
        content_dict = content_dict['bossresponse']
        return content_dict[content_dict.keys()[0]]

    @classmethod
    def mk_fixed_coordinates_value_getter(cls, coord_list):
        return mk_fixed_coordinates_value_getter(['bossresponse'] + coord_list)

    ####################################################################################
    ###### UTILS #######################################################################

    def url(self, query, service=None, params=None):
        service = service or self.default_service
        params = params or self.default_params
        return yboss_search_root_url + self.rel_url(query=query, service=service, params=params)

    def rel_url(self, query, service=None, params=None):
        service = service or self.default_service
        params = params or self.default_params
        params = Yboss.mk_req_params(service, params)
        return "%s?q=%s%s" % (service, self.url_encode_str(query), self.url_encode_params(params))

    def get_filename_for_query(self, query, service=None, params=None):
        return self.rel_url(query, service=service, params=params).replace('?', '--')

    def get_filepath(self, filespec):
        if os.path.exists(filespec):
            file_path = filespec
        else:
            file_path = os.path.join(self.default_save_folder, filespec)
            if not os.path.exists(file_path):
                # assume it's a query, and derive what the filepath should be
                file_path = os.path.join(self.default_save_folder,
                                         self.get_filename_for_query(filespec))
        return file_path

    def get_filepath_for_params(self, query, service=None, params=None, filepath=None):
        filepath = filepath or self.default_save_folder
        if os.path.isdir(filepath):
            # if filepath is a directory, need to make a filename for it
            filepath = os.path.join(
                filepath,
                self.get_filename_for_query(query, service=service, params=params))
        return filepath

    def fill_with_defaults(self, service=None, params=None):
        service = service or self.default_service
        params = Yboss.mk_req_params(service, params)
        return service, params

    @classmethod
    def mk_req_params(cls, service, params=None):
        params = params or {}
        return dict(dict(default_universal_args, **service_default_req_args[service]), **params)

    @classmethod
    def url_encode_str(cls, s):
        return url_encode_yboss(s)

    @classmethod
    def url_encode_params(cls, params):
        u = ''
        for p, v in params.iteritems():
            if isinstance(v, basestring):
                u += '&%s=%s' % (p, v)
            else:
                u += '&%s=%s' % (p, str(v))
        return u

    @classmethod
    def print_some_resources(cls):
        print '''
        guide to yahoo BOSS: http://developer.yahoo.com/boss/search/boss_api_guide/index.html
        pricing (by service): http://developer.yahoo.com/boss/search/#pricing
        services: web, limitedweb,
            images, news, blogs, related
        response fields: http://developer.yahoo.com/boss/search/boss_api_guide/webv2_response.html
        market and languages: http://developer.yahoo.com/boss/search/boss_api_guide/supp_regions_lang.html
        '''

    @classmethod
    def process_df(cls, df):
        df['dispurl'] = df['dispurl'].map(YbossText.remove_html_bold)
        df['title'] = df['title'].apply(YbossText.html2text)
        df['abstract'] = df['abstract'].apply(YbossText.html2text)
        df = ms.daf.manip.reorder_columns_as(df, major_cols + minor_cols)
        df = df.reset_index(drop=True)
        return df
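# Usage sketch ('some query' is a placeholder; assumes MON_YB_KEY / MON_YB_SECRET
# are set, and 'limitedweb' is the default service from default_yboss_attrs):
#
#   yboss = Yboss()
#   df = yboss.slurp_results_df_multiple_pages('some query', n_pages=2)
#   df = Yboss.process_df(df)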
        'style': 'raw'
    },  # age, sort, title, url
    'blogs': {
        'count': 20,
        'style': 'raw'
    },  # age, sort, count, title, url
    'related': {
        'count': 10
    },  # age, sort, count, title, url
    'images': {
        'count': 35
    }  # filter, queryfilter, dimensions, referurl, url
}

default_yboss_attrs = {
    'oauth_consumer_key': get_environment_variable('MON_YB_KEY'),
    'oauth_consumer_secret': get_environment_variable('MON_YB_SECRET'),
    'default_service': 'limitedweb',
    'default_params': {},
    'default_save_folder': os.getcwd()
}

service_list = ['limitedweb', 'web', 'blogs', 'news', 'related', 'images']

major_cols = ['query', 'position', 'title', 'abstract', 'dispurl',
              'num_of_slurped_results', 'author']
minor_cols = ['date', 'url', 'clickurl']