Example #1
    def get_last_job_ids(self):
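        """Return the ids of the items scraped by the most recent job that was
        run with the same searchterm; return an empty list if that cannot be
        determined."""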
        project_id = os.environ.get("SCRAPY_PROJECT_ID")
        api_key = self.spider.settings.get("SCRAPINGHUB_API_KEY")

        if not project_id or not api_key:
            return []

        client = ScrapinghubClient(api_key)
        project = client.get_project(project_id)
        jobs = project.jobs.list()

        if not jobs:
            return []

        # Find the last job that ran with this spider's searchterm;
        # the same spider can be invoked with different searchterms.
        last_matching_job = None

        for job_summary in jobs:
            key = job_summary["key"]
            job = client.get_job(key)

            metadata = dict(job.metadata.list())
            searchterm = metadata.get("spider_args", {}).get("searchterm", "")

            if self.spider.searchterm == searchterm:
                last_matching_job = job
                break

        if not last_matching_job:
            return []

        return [item["id"] for item in last_matching_job.items.iter()]
Example #2
def create_json_schema(source_key: str,
                       item_numbers: Optional[List[int]] = None) -> Optional[dict]:
    client = ScrapinghubClient()
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
    elif helpers.is_job_key(source_key):
        job = client.get_job(source_key)
        items_count = api.get_items_count(job)
        store = job.items
    else:
        logger.error(f"{source_key} is not a job or collection key")
        return

    if items_count == 0:
        logger.error(f"{source_key} does not have any items")
        return

    item_n_err = "{} is not a valid item number; choose numbers between 0 and {}"
    if item_numbers:
        item_numbers.sort()
        if item_numbers[-1] >= items_count or item_numbers[0] < 0:
            bad = item_numbers[0] if item_numbers[0] < 0 else item_numbers[-1]
            logger.error(item_n_err.format(bad, items_count - 1))
            return
    else:
        item_numbers = set_item_no(items_count)

    # Fetch one item at each selected index and infer a schema from the samples.
    samples = []
    for n in item_numbers:
        items = api.get_items(source_key, start_index=n, count=1)
        samples.append(items[0])

    return infer_schema(samples)
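
# A hypothetical call, assuming a job key of the usual
# "<project_id>/<spider_id>/<job_id>" form:
#
# schema = create_json_schema('123456/1/2', item_numbers=[0, 5, 10])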
Example #3
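# Startup hook (e.g. a Django AppConfig.ready): fetch the job once and cache
# its items in a module-level variable, sorted by score, highest first.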
def ready(self):
    global test
    apikey = '88133cc793ab4296b56db8a87eaae1ec'
    client = ScrapinghubClient(apikey)
    test = client.get_job('223795/1/3')
    test = sorted(test.items.list(),
                  key=lambda k: k['score'],
                  reverse=True)
Example #4
def showBooks(request):
    global job
    job = test
    if job is None:
        # No cached items from startup: fetch the job and sort by score.
        apikey = '88133cc793ab4296b56db8a87eaae1ec'
        client = ScrapinghubClient(apikey)
        job = client.get_job('223795/1/3')
        job = sorted(job.items.list(), key=lambda k: k['score'], reverse=True)
    return render(request, 'user_page.html', {
        'spider_books': job,
        'user_fullname': request.user.get_full_name,
        'myuser_id': request.user.myuser.id,
    })
Example #5
def main():
    args = parse_args()
    apikey = os.environ.get('SH_APIKEY') or args.apikey
    if not apikey:
        print('Please set an API key')
        exit(1)

    client = ScrapinghubClient(apikey)
    job = client.get_job(args.job)
    events = args.func(job)
    if args.command == 'errors':
        report_errors = create_errors_report(
            events, max_urls_for_output=min(args.max, 30))
        print(report_errors)
Example #6
def menu():
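	# Field names are Polish: grupa = group, pozycja = menu item, wariant = variant,
	# cena = price, kwota = amount, waluta = currency, opis = description.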
	client = ScrapinghubClient(config['scrapinghub']['api_key'])
	project = client.get_project(config['scrapinghub']['project_id'])
	job = project.jobs.list(spider=config['scrapinghub']['spider_name'], state='finished', count=1)[0]
	job = client.get_job(job['key'])

	menu = {}
	menu['aktualnosc'] = job.metadata.get('finished_time')
	menu['restauracja'] = {
		"nazwa": "CamelPizza",
		"logo": "https://www.camelpizza.pl/system/logos/27323/menu_size/1549450693.png",
		"url": "http://camelpizza.pl"
	}
	menu['grupy'] = []

	def get_grupa(item):
		for grupa in menu['grupy']:
			if grupa['nazwa'] == item['grupa']:
				return grupa
		grupa = { 'nazwa': item['grupa'], 'pozycje': [] }
		menu['grupy'].append(grupa)
		return grupa

	def get_pozycja(item):
		grupa = get_grupa(item)
		for pozycja in grupa['pozycje']:
			if pozycja['nazwa'] == item['pozycja']:
				return pozycja
		pozycja = { 'nazwa': item['pozycja'], 'opis': item['opis'], 'warianty': [] }
		grupa['pozycje'].append(pozycja)
		return pozycja

	def get_cena(item):
		kwota, waluta = item['cena'].replace(u'zł', u' zł').split()
		kwota = float(kwota.replace(',', '.'))
		waluta = waluta.replace(u'zł', 'PLN')
		return { 'kwota': kwota, 'waluta': waluta }

	items = job.items.list()
	for item in items:
		try:
			pozycja = get_pozycja(item)
			wariant = { 'opis': item['wariant'], 'ceny': [ get_cena(item) ]}
			pozycja['warianty'].append(wariant)
		except (KeyError, ValueError):
			print("Invalid item:", item)

	return jsonify(menu)
Example #7
def index():
    apikey = os.environ.get("APIKEY")
    job_id = os.environ.get("JOB_ID")

    client = ScrapinghubClient(apikey)
    job = client.get_job(job_id)

    data = []

    for item in job.items.iter():
        data.append({
            'title': item['title'][0],
            'director': item['director'][0],
            'summary': item['summary'][0],
        })

    return render_template('index.html', data=data)
Example #8
def get_data():
    client = ScrapinghubClient('<KEY>')
    project = client.get_project(441598)
    spider = project.spiders.get('state')
    job_id = list(project.activity.iter(count=2))
    job_id = job_id[1]['job']
    job = client.get_job(job_id)

    state_name = []
    death = []
    cured = []
    confirmed_cases = []
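    # The items carry bytes keys and values here, hence the b'...' keys
    # and the .decode() calls below.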
    for item in job.items.iter():
        state_name.append(item[b'state'].decode("utf-8"))
        death.append(item[b'death'].decode("utf-8"))
        cured.append(item[b'cured'].decode("utf-8"))
        confirmed_cases.append(item[b'confirmed_cases'].decode("utf-8"))

    data = {'state': state_name, 'death': death, 'cured': cured,
            'confirmed_cases': confirmed_cases}
    data = pd.DataFrame(data)
    data = data[:-1]  # drop the last row
    return data
Example #9
def main():
    requestsMade = 0
    while requestsMade < 3:
        # running the job
        client = ScrapinghubClient(APIKEY)
        project = client.get_project(projectId)
        job = project.jobs.run(spider)

        if job.metadata.get('state') in ('running', 'pending', 'finished'):
            # getting the result from the last finished job
            lastFinishedJobs = project.jobs.iter(spider=spider, state='finished', count=1)
            for job in lastFinishedJobs:
                lastJobId = job['key']
                jobData = client.get_job(lastJobId)
                saveToMongo(jobData.items)
            break

        requestsMade += 1
        time.sleep(5)
Example #10
class SHConnection:
    ''' Wrapper for scrapinghub client, project and api calls
    to simplify use.
    '''

    def __init__(self, api_key, default_project_key=None):
        self.api_key = api_key
        self.project_key = resolve_project_key(
            default_project_key=default_project_key
        )

    def __enter__(self):
        self.client = ScrapinghubClient(self.api_key)
        self.project = self.client.get_project(self.project_key)
        return self

    def __exit__(self, *args):
        self.client.close()

    def jobs_iter(self, **kwargs):
        return self.project.jobs.iter(**kwargs)

    def get_job(self, job_id):
        return self.client.get_job(job_id)
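
# A minimal usage sketch (hypothetical key and project id; assumes the
# resolve_project_key helper referenced in __init__ is available):
with SHConnection('<APIKEY>', default_project_key='123456') as conn:
    for summary in conn.jobs_iter(state='finished', count=5):
        job = conn.get_job(summary['key'])
        print(job.metadata.get('spider'), job.metadata.get('finished_time'))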
Example #11
def get_spider_name(job_key):
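    # With no auth argument, ScrapinghubClient falls back to the SH_APIKEY
    # environment variable.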
    client = ScrapinghubClient()
    job = client.get_job(job_key)
    return job.metadata.get("spider")
Example #12
client.projects.summary()

# list_projects is assumed to be defined earlier, e.g. via client.projects.list()
project = client.get_project(list_projects[0])

### Invoking a job 
spider = project.spiders.get(project.spiders.list()[0]['id'])

spider.jobs.summary()

last_key = list(spider.jobs.iter_last())[0]['key']



## Accessing job output data
### Project ID/Spider ID/Job ID
job = client.get_job(last_key)

# =============================================================================
# SQL Alchemy Connection to the database   
# =============================================================================

import sqlalchemy as sqal
from sqlalchemy import MetaData, create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy.orm import sessionmaker

## Establishes a DBAPI connection.
db_connect_str = URL(**postgres_key)  # postgres_key: connection-settings dict defined elsewhere
engine = create_engine(db_connect_str)

# reflects the schema, and produces mapping
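# A sketch of that reflection step (assuming the engine above connects):
metadata = MetaData()
metadata.reflect(bind=engine)  # metadata.tables now maps table names to Table objects
Session = sessionmaker(bind=engine)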
Example #13
 if job['state'] != 'finished':
     mail_subject = "Problem: job state is not finished"
     mail_body = "Problem: job state is not finished " + job['key']
     send_email(mail_from, mail_to_error, mail_username, mail_password, mail_server, mail_port, mail_subject, mail_body)
     exit()
 if 'items' not in job:
     mail_subject = "Problem: the job contains no items"
     mail_body = "Problem: the job contains no items " + job['key']
     send_email(mail_from, mail_to_error, mail_username, mail_password, mail_server, mail_port, mail_subject, mail_body)
     exit()
 if job['key'] in str(storico_jobs):
     # Already-seen job: skip it (the error e-mail for this case was disabled).
     continue
 items = hc.get_job(job['key']).items.list()
 job_key = [job['key']]
 for item in items:
     lista.append((item['isin'], item['isin_titolo'], item['scadenza'], item['strike'], item['tipo_opzione'],
                   item['volume_contratti'], item['volatilita_implicita']))
 run_time = job['running_time'] / 1000  # running_time is in milliseconds
 data = datetime.datetime.fromtimestamp(run_time).strftime("%Y%m%d")
 if server == 'remoto':
     directory = dir + lista[0][1] + "/opzioni/"
     csv_filename = dir + lista[0][1] + "/opzioni/" + data + '.csv'
 elif server == 'local':
     directory = dir + lista[0][1] + "\\opzioni\\"
     csv_filename = dir + lista[0][1] + "\\opzioni\\" + data + '.csv'
 if not os.path.exists(directory):
     os.makedirs(directory)
 with open(csv_filename, 'w', newline="") as f: