from __future__ import print_function
import datetime
import scrapy
import time
import sys
import os
import re

sys.path.append("_lib/")
from CsvHelper import CsvHelper
from UrlHelper import UrlHelper
import constants

__author__ = 'Jacek Aleksander Gruca'

# The field names of the output CSV file.
FIELD_NAMES = ['pinterest_handle', 'pinterest_url', 'timestamp', 'number_of_followers']
PREFIX = 'https://www.pinterest.com/'


# This class is the spider run by Scrapy to scrape the number of followers from Pinterest pages.
class PinterestScraper(scrapy.Spider):
    name = 'Pinterest Scraper'

    # This list is used by Scrapy to begin crawling.
    start_urls = []

    # This dictionary maps each URL to its Pinterest handle, which is used when populating the output file.
    url_map = {}

    # This list holds all the URLs still to visit; they are handed to Scrapy in order, one by one.
    urls_to_visit = []

    # The constructor of the spider-scraper. It takes the names of the input and output files
    # and performs some pre-processing.
    def __init__(self, input_file=None, output_file=None):
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                  " -a input_file=<your input file> -a output_file=<your output file>\n")
            return
        self.url_helper = UrlHelper(PREFIX)
        self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                                self.start_urls, self.url_map, self.urls_to_visit)
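
    # For orientation, a hypothetical sketch (assumed, not the real code) of what
    # UrlHelper.process_urls_for_scrapy is expected to do; the actual implementation
    # lives in _lib/UrlHelper.py:
    #
    #   for handle in input_lines:
    #       url = PREFIX + handle.strip() + '/'
    #       start_urls.append(url)     # seed Scrapy with the first URL(s)
    #       url_map[url] = handle      # so parse() can recover the handle
    #       urls_to_visit.append(url)  # queue for one-by-one crawling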

    def make_requests_from_url(self, url):
        return UrlHelper.make_requests_from_url(url)

    # This method parses each of the pages found under urls_to_visit and extracts the number
    # of followers from each of them.
    def parse(self, response):
        # A raw string avoids the invalid escape sequences \s and \d would otherwise produce.
        p = re.compile(r'.*"pinterestapp:followers"\s*:\s*"(\d+)"')
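        # Judging from the regex itself, the pattern is meant to match a page line such as
        # (illustrative; the exact page markup may differ):
        #   "pinterestapp:followers": "12345"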
        # Note: body_as_unicode() is deprecated in newer Scrapy releases, where
        # response.text is the equivalent.
        body = response.body_as_unicode().split('\n')
        follower_count = None
        for line in body:
            m = p.match(line)
            if m:
                follower_count = m.group(1)
        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {FIELD_NAMES[0]: self.url_map[response.meta['start_url']],
             FIELD_NAMES[1]: response.meta['start_url'],
             FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
             FIELD_NAMES[3]: follower_count})
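        # A written row looks roughly like the following (values illustrative; the
        # timestamp format depends on constants.TIME_FORMAT):
        #   some_handle,https://www.pinterest.com/some_handle/,2016-01-01 12:00:00,12345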
        # If there are still URLs to process, yield another crawl request.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
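
# Example invocation, matching the usage message printed by the constructor
# (file names are placeholders):
#   scrapy runspider scrapePinterest.py -a input_file=handles.csv -a output_file=followers.csv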