Example #1
class iTunesScraper(scrapy.Spider):
	name = 'iTunes Scraper'

	# This variable is used by Scrapy to begin crawling.
	start_urls = []

	# This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
	urls_to_visit = []

	# This method is the constructor of the spider-scraper. It takes in the names of the input and output files
	# and performs some pre-processing.
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.url_helper = UrlHelper("")  # no prefix
		self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															 self.start_urls, None, self.urls_to_visit)

	def make_requests_from_url(self, url):
		return UrlHelper.make_requests_from_url(url)

	def parse(self, response):
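		# Count the filled, half and "ghost" (empty) star icons, first for the current
		# version and then for all versions. The [:-8] slice below presumably trims a
		# trailing suffix such as " Ratings" from the extracted review-count text.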

		fullStars = len(response.xpath("//div[@id='left-stack']/div[2]/div[2]/div[1]/span[@class='rating-star']"))
		halfStars = len(response.xpath("//div[@id='left-stack']/div[2]/div[2]/div[1]/span[@class='rating-star half']"))
		ghostStars = len(response.xpath("//div[@id='left-stack']/div[2]/div[2]/div[1]/span[@class='rating-star ghost']"))

		reviewCount = response.xpath("//div[@id='left-stack']/div[2]/div[2]/span[2]/text()").extract_first()
		reviewCount = reviewCount.strip()[:-8]

		fullStarsAll = len(response.xpath("//div[@id='left-stack']/div[2]/div[4]/div[1]/span[@class='rating-star']"))
		halfStarsAll = len(response.xpath("//div[@id='left-stack']/div[2]/div[4]/div[1]/span[@class='rating-star half']"))
		ghostStarsAll = len(
			response.xpath("//div[@id='left-stack']/div[2]/div[4]/div[1]/span[@class='rating-star ghost']"))

		reviewCountAll = response.xpath("//div[@id='left-stack']/div[2]/div[4]/span[1]/text()").extract_first()
		reviewCountAll = reviewCountAll.strip()[:-8]

		message = None
		if fullStars + halfStars + ghostStars != 5 or fullStarsAll + halfStarsAll + ghostStarsAll != 5:
			message = "Error scraping page, scraping skipped."

		self.csv_helper.write_row_to_output_file(
			FIELD_NAMES,
			{FIELD_NAMES[0]: response.meta['start_url'], \
			 FIELD_NAMES[1]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT), \
			 FIELD_NAMES[2]: fullStars + 0.5 * halfStars if not message else message,
			 FIELD_NAMES[3]: reviewCount if not message else None,
			 FIELD_NAMES[4]: fullStarsAll + 0.5 * halfStarsAll if not message else None,
			 FIELD_NAMES[5]: reviewCountAll if not message else None})

		# If there are still URLs to process, then yield more crawling.
		if self.urls_to_visit:
			yield self.make_requests_from_url(self.urls_to_visit.pop(0))
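
The spider examples in this listing all construct a CsvHelper and a UrlHelper, neither of which is shown in these excerpts. The sketch below is only a rough reconstruction of the interface they appear to expose, inferred from how the spiders call them; it is an assumption, not the project's actual implementation (the CsvHelper used in the later, non-scraper examples clearly has a different API).

# Hypothetical reconstruction of the helper interface the spiders appear to assume;
# the real CsvHelper/UrlHelper implementations are not part of this listing.
import csv

import scrapy


class CsvHelper:
    def __init__(self, field_names, input_file=None, output_file=None):
        # The spiders treat `stop` as a "bad invocation" flag.
        self.stop = input_file is None or output_file is None
        self.input_file = input_file
        self.output_file = output_file
        if not self.stop:
            with open(output_file, "w", newline="") as f:
                csv.DictWriter(f, fieldnames=field_names).writeheader()

    def get_input_file_content(self):
        with open(self.input_file, newline="") as f:
            return list(csv.reader(f))

    def write_row_to_output_file(self, field_names, row):
        with open(self.output_file, "a", newline="") as f:
            csv.DictWriter(f, fieldnames=field_names).writerow(row)


class UrlHelper:
    def __init__(self, prefix):
        self.prefix = prefix

    def process_urls_for_scrapy(self, rows, start_urls, url_map, urls_to_visit):
        # Turn each input row into a full URL; remember the original handle so the
        # spiders can write it to the output file via url_map.
        for row in rows:
            handle = row[0]
            url = self.prefix + handle
            if url_map is not None:
                url_map[url] = handle
            urls_to_visit.append(url)
        if urls_to_visit:
            start_urls.append(urls_to_visit.pop(0))

    @staticmethod
    def make_requests_from_url(url):
        # Carry the originating URL along so parse() can key its output on it.
        return scrapy.Request(url, meta={"start_url": url}, dont_filter=True)

With an interface along these lines, each spider loads its input URLs in __init__, scrapes one page per request, and appends one row per scraped page to the output CSV.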
def on_start():
    global user_list
    global csv_helper
    global data_helper

    data_helper = DataHelper()
    csv_helper = CsvHelper()
    user_list_file_name = "../Resources/UserDetails.csv"
    user_list = csv_helper.get_csv_values(user_list_file_name)
Example #3
def split(in_dir, out_dir, csv):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
        print("created folder:", out_dir)
    csv_helper = CsvHelper()
    csv_helper.set_csv(csv)
    print("[INFO] using csv file:", csv)
    image_paths_df = csv_helper.get_single_column(IMAGE_PATH)
    copy_images(image_paths_df, in_dir, out_dir)
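
copy_images is defined elsewhere in that project and is not shown here. A minimal sketch of what it might do, assuming image_paths_df iterates over image file names, could be:

import os
import shutil


def copy_images(image_paths, in_dir, out_dir):
    # Hypothetical helper: copy every listed image from in_dir into out_dir,
    # warning about entries that are missing on disk.
    for name in image_paths:
        src = os.path.join(in_dir, str(name))
        if os.path.isfile(src):
            shutil.copy(src, out_dir)
        else:
            print("[WARN] missing image:", src)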
class PinterestScraper(scrapy.Spider):
    name = "Pinterest Scraper"

    # This variable is used by Scrapy to begin crawling.
    start_urls = []

    # This dictionary holds the mapping of the URLs to Pinterest handles, which is used when populating the output file.
    url_map = {}

    # This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
    urls_to_visit = []

    # This method is the constructor of the spider-scraper. It takes in the names of the input and output files
    # and performs some pre-processing.
    def __init__(self, input_file=None, output_file=None):
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print(
                "\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__)
                + " -a input_file=<your input file> -a output_file=<your output file>\n"
            )
            return
        self.url_helper = UrlHelper(PREFIX)
        self.url_helper.process_urls_for_scrapy(
            self.csv_helper.get_input_file_content(), self.start_urls, self.url_map, self.urls_to_visit
        )

    def make_requests_from_url(self, url):
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        # This method parses each of the pages found under the urls_to_visit and extracts the number
        # of followers from each of them
        p = re.compile(r'.*"pinterestapp:followers"\s*:\s*"(\d+)"')
        body = response.body_as_unicode().split("\n")

        followerCount = None
        for line in body:
            m = p.match(line)
            if m:
                followerCount = m.group(1)

        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {
                FIELD_NAMES[0]: self.url_map[response.meta["start_url"]],
                FIELD_NAMES[1]: response.meta["start_url"],
                FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
                FIELD_NAMES[3]: followerCount,
            },
        )

        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
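
Only one line of the downloaded page needs to match the pattern compiled in parse(); a self-contained check against a synthetic line (not real Pinterest markup) shows what the regex captures:

import re

p = re.compile(r'.*"pinterestapp:followers"\s*:\s*"(\d+)"')
# Synthetic sample line for illustration only; the real page markup may differ.
sample = '{"pinterestapp:followers": "12345", "pinterestapp:following": "67"}'
m = p.match(sample)
print(m.group(1) if m else None)  # prints 12345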
class FacebookScraper(scrapy.Spider):
	name = 'Facebook Scraper'

	# This variable is used by Scrapy to begin crawling.
	start_urls = []

	# This dictionary holds the mapping of the URLs to Facebook handles, which is used when populating the output file.
	url_map = {}

	# This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
	urls_to_visit = []

	# This method is the constructor of the spider-scraper. It takes in the names of the input and output files
	# and performs some pre-processing.
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.url_helper = UrlHelper(PREFIX)
		self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															 self.start_urls, self.url_map, self.urls_to_visit)

	def make_requests_from_url(self, url):
		return UrlHelper.make_requests_from_url(url)

	def parse(self, response):
		# Here we're in the method that performs the scraping. Below an xpath expression extracts all HTML comments
		# (it just so happens that the number of likes is in a comment)
		# from the Facebook page and narrows it down to the div containing the number of likes.
		comment = response.xpath('//comment()').re(r'<div.*%s.*/div>' % LIKES_ELEMENT_NAME)

		# Convert the text in the comment to HTML DOM object.
		comment_sel = Selector(text=comment[0], type="html")

		# Use XPATH to extract the final text with the number of likes.
		likes_count = (comment_sel.xpath('//*[@id="%s"]/*/text()' \
													% LIKES_ELEMENT_NAME).extract()[0]).replace(',', '').strip()

		self.csv_helper.write_row_to_output_file(
			FIELD_NAMES,
			{FIELD_NAMES[0]: self.url_map[response.meta['start_url']], \
			 FIELD_NAMES[1]: response.meta['start_url'], \
			 FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT), \
			 FIELD_NAMES[3]: likes_count})

		# If there are still URLs to process, then yield more crawling.
		if self.urls_to_visit:
			yield self.make_requests_from_url(self.urls_to_visit.pop(0))
Example #6
def run_display(lidar_points_csv, flight_path_csv):
    """
    This function displays each sweep one by one.
    It first calls the read_lidar_element() and read_flight_path_element()
    functions of the CsvHelper class to read the lidar points and the flight
    path points from the given csv files. It then calls the display_sweep
    function of the DisplayUtil class to display each sweep one by one.

    @param lidar_points_csv: File name of the lidar points csv.
    @param flight_path_csv:  File name of the flight path points csv.
    """
    csv_helper = CsvHelper(lidar_points_csv, flight_path_csv)
    lidar_points = csv_helper.read_lidar_element()
    flight_path = csv_helper.read_flight_path_element()
    display_helper = DisplayUtil(lidar_points, flight_path)
    display_helper.display_sweep()
Example #7
def do_predict(label, category, csvs_folder_path):

    model_path = Utilities.construct_filepath(in_dir, [category, label],
                                              ".model")
    text_clf_svm = pickle.load(open(model_path, "rb"))

    # read in test csv data
    test_csv_path = csvs_folder_path + category + const.TEST
    csv_helper = CsvHelper()
    csv_helper.set_csv(test_csv_path)
    # create a new column titled with the label so that we can fill it in later
    csv_helper.add_column(label)
    test_df = csv_helper.get_csv()

    # Do prediction
    data_df = test_df["title"]
    id_df = test_df["itemid"]
    predicted_multiclass = text_clf_svm.predict_proba(data_df)
    pred_top2 = get_top_2_dict(text_clf_svm.classes_, predicted_multiclass,
                               id_df)

    # write back to csv
    # pred_top2 holds the top-2 predictions per item; test_df is the original csv
    write_to_csv(pred_top2, test_df, test_csv_path)
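
get_top_2_dict and write_to_csv come from the same project and are not shown here. A plausible minimal get_top_2_dict, assuming predict_proba returns one row of class probabilities per item, might look like this:

import numpy as np


def get_top_2_dict(classes, probabilities, item_ids):
    # Hypothetical helper: map each itemid to its two most probable class labels.
    top2 = {}
    for item_id, probs in zip(item_ids, probabilities):
        best = np.argsort(probs)[::-1][:2]
        top2[item_id] = [classes[i] for i in best]
    return top2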
class ChromeScraper(scrapy.Spider):
	name = 'Chrome Scraper'

	# This variable is used by Scrapy to begin crawling.
	start_urls = []

	# This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
	urls_to_visit = []

	# This method is the constructor of the spider-scraper. It takes in the names of the input and output files
	# and performs some pre-processing.
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.url_helper = UrlHelper("") # no prefix
		self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															 self.start_urls, None, self.urls_to_visit)

	def make_requests_from_url(self, url):
		return UrlHelper.make_requests_from_url(url)

	def parse(self, response):
		# This method parses each of the pages found under the urls_to_visit and extracts the number
		# of users from each of them
		p = re.compile(r'.*name\s*=\s*"user_count"\s*>\s*(\d+)\s*<')
		body = response.body_as_unicode().split('\n')

		userCount = None
		for line in body:
			m = p.match(line)
			if m:
				userCount = m.group(1)

		self.csv_helper.write_row_to_output_file(
			FIELD_NAMES,
			{FIELD_NAMES[0]: response.meta['start_url'], \
			 FIELD_NAMES[1]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT), \
			 FIELD_NAMES[2]: userCount})

		# If there are still URLs to process, then yield more crawling.
		if self.urls_to_visit:
			yield self.make_requests_from_url(self.urls_to_visit.pop(0))
Example #9
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.url_helper = UrlHelper("")  # no prefix
		self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															 self.start_urls, None, self.urls_to_visit)
class TwitterScraper(scrapy.Spider):
	name = 'Twitter Scraper'

	# This variable is used by Scrapy to begin crawling.
	start_urls = []

	# This dictionary holds the mapping of the URLs to Twitter handles, which is used when populating the output file.
	url_map = {}

	# This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
	urls_to_visit = []

	# This method is the constructor of the spider-scraper. It takes in the names of the input and output files
	# and performs some pre-processing.
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.urlHelper = UrlHelper(PREFIX)
		self.urlHelper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															self.start_urls, self.url_map, self.urls_to_visit)

	# Here we override the method make_requests_from_url to use the one from the UrlHelper instead of the one in
	# scrapy.Spider
	def make_requests_from_url(self, url):
		return UrlHelper.make_requests_from_url(url)

	def parse(self, response):
		# Here we're in the method that performs the scraping. Below an xpath expression extracts the
		# number of followers from the element with attribute data-nav equal to "followers"
		followers_count = response.xpath('//*[@data-nav="followers"]/@title').re(r"[\d,]*")[0].replace(',', '')

		self.csv_helper.write_row_to_output_file(
			FIELD_NAMES,
			{FIELD_NAMES[0]: self.url_map[response.meta['start_url']], \
			 FIELD_NAMES[1]: response.meta['start_url'], \
			 FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT), \
			 FIELD_NAMES[3]: followers_count})

		# If there are still URLs to process, then yield more crawling.
		if self.urls_to_visit:
			yield self.make_requests_from_url(self.urls_to_visit.pop(0))
Example #11
def test_optimization(lidar_points_file_name, flight_path_file_name):
    """
    This function displays each sweep of the optimized flight path one by one.
    It first calls the read_lidar_element() and read_flight_path_element()
    functions of the CsvHelper class to read the lidar points and the flight
    path points from the given csv files. It then calls the optimization
    function of the Optimization class to optimize the flight path. Finally
    it calls the display_sweep function of the DisplayUtil class to display
    each sweep of the optimized flight path one by one.

    @param lidar_points_file_name: File name of the lidar points csv.
    @param flight_path_file_name:  File name of the flight path points csv.
    """
    csv_helper = CsvHelper(lidar_points_file_name, flight_path_file_name)
    flight_path = csv_helper.read_flight_path_element()
    lidar_points = csv_helper.read_lidar_element()
    if len(flight_path) != len(lidar_points):
        print("Invalid input data! The number of sweeps in the two files is not equal.")
    else:
        optimization = Optimization(lidar_points, flight_path)
        optimized_lidar_points, optimized_flight_path = optimization.optimization()
        display_helper = DisplayUtil(optimized_lidar_points,
                                     optimized_flight_path)
        display_helper.display_sweep()
Example #12
def run_flight_optimization(lidar_points_csv, flight_path_csv,
                            output_flight_path_csv):
    """
    This function optimizes the flight path.
    It first calls the read_lidar_element() and read_flight_path_element()
    functions of the CsvHelper class to read the lidar points and the flight
    path points from the given csv files. It then calls the optimization
    function of the Optimization class. Finally it calls the write_flight_path
    function of the CsvHelper class to write the optimized flight path to the
    given output file.

    @param lidar_points_csv: File name of the lidar points csv.
    @param flight_path_csv:  File name of the flight path points csv.
    @param output_flight_path_csv: File name to which the optimized flight
                                   path is written.
    """
    csv_helper = CsvHelper(lidar_points_csv, flight_path_csv)
    lidar_points = csv_helper.read_lidar_element()
    flight_path = csv_helper.read_flight_path_element()

    optimization = Optimization(lidar_points, flight_path)
    optimized_lidar_points, optimized_flight_path = optimization.optimization()
    csv_helper.write_flight_path(optimized_flight_path, output_flight_path_csv)
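
Taken together, the lidar/flight-path helpers could be driven like this (the CSV file names are placeholders, not files from the project):

# Hypothetical invocation; optimize the flight path, then display the result.
run_flight_optimization("lidar_points.csv", "flight_path.csv", "optimized_flight_path.csv")
run_display("lidar_points.csv", "optimized_flight_path.csv")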
Example #13
        losses[item] = "categorical_crossentropy"  # loss function for this output
        lossWeights[item] = 1.0  # weight of this output's loss in the total
    return losses, lossWeights


def gen_y_parameter_for_fit(wanted_categories_output, labels):
    y_dict = {}
    for i in range(0, len(wanted_categories_output)):
        y_dict[wanted_categories_output[i]] = labels[i]
    return y_dict


if __name__ == "__main__":
    image_dir, out_dir, category, csv_path = process_arg()
    csv_helper = CsvHelper()
    label_header_list = get_label_headers(csv_helper, csv_path,
                                          category + "_image")
    data, all_labels = set_data_labels(image_dir, csv_helper)

    wanted_categories = get_wated_categories(category)
    wanted_categories_lower = gen_name(wanted_categories, to_add="")
    wanted_categories_output = gen_name(wanted_categories, to_add="_output")
    wanted_categories_loss = gen_name(wanted_categories, to_add="_output_loss")

    # selects the labels listed in wanted categories
    labels_to_use = get_labels_to_use(label_header_list, all_labels,
                                      wanted_categories)

    # binarize the labels
    binarized_labels_dict = binarizer(
Example #14

if __name__ == "__main__":

    save_model = False
    args = Utilities.process_arg("-save")
    out_dir = "/content/drive/My Drive/models"
    csvs_folder_path = "/content/drive/My Drive/yy_fashion.csv"
    category = "fashion"
    save = "true"

    if save == "t" or save == "T" or save == "true":
        save_model = True

    train_csv_path = "/content/drive/My Drive/yy_fashion.csv"

    csv_helper = CsvHelper()
    csv_helper.set_csv(train_csv_path)  # set csv file as train_csv_path
    print("Set csv done")
    '''
    This part is for mass training: one model is trained for each attribute/label.
    For individual model tuning, set `labels` to a single label instead
    (see the commented-out line below).
    '''
    csv_helper.set_all_headers_as_label()  # mark all columns as label headers
    # get labels, i.e. all column names besides image name, itemid and title
    labels = csv_helper.get_label_headers()
    # labels = ["Colour_group"]  # uncomment to train just this one label
    print("Predicting These classes : ", labels)
    for label in labels:
        do_training(label, csv_helper, save_model)
def print_usage():
	print("Run as:\npython %s <input file> <output file>" % os.path.basename(__file__))

if len(sys.argv) != 3:
	print_usage()
	exit(0)

# This dictionary holds the mapping of the URLs to Twitter phrases, which is used when populating the output file.
url_map = {}

# This list will contain all the URLs to visit.
urls_to_visit = []

# Create CsvHelper which will aid in processing the CSV files
csv_helper = CsvHelper(FIELD_NAMES, sys.argv[1], sys.argv[2])
if csv_helper.stop:
	print_usage()
	exit(0)

# Here we take the system's current time and convert it to the number of seconds since the 'epoch'.
# Further on we subtract the number of seconds in 24 hours from it and specify the result
# as the time from which the number of tweets should be counted.
current_time = time.mktime(time.localtime())
time_24_hours_ago = current_time - constants.SECONDS_IN_24H_COUNT
time_query_parameter = "&mintime=" + repr(int(time_24_hours_ago))

# Create the UrlHelper which will aid in processing URLs
url_helper = UrlHelper(
	'http://otter.topsy.com/searchcount.js?dynamic=1&count_method=citation&' +
	'apikey=09C43A9B270A470B8EB8F2946A9369F3&%s&q=' % time_query_parameter)
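
The rest of this script is not included in the excerpt, but each phrase from the input file is presumably appended to the prefix built above. A hypothetical illustration of assembling the count URL for a single phrase (the phrase, the exact parameter layout and the URL-encoding step are assumptions):

import time
import urllib.parse

SECONDS_IN_24H_COUNT = 24 * 60 * 60  # assumed value of constants.SECONDS_IN_24H_COUNT
mintime = int(time.mktime(time.localtime()) - SECONDS_IN_24H_COUNT)
phrase = "example phrase"  # stand-in for a phrase read from the input CSV
query_url = ('http://otter.topsy.com/searchcount.js?dynamic=1&count_method=citation&'
             'apikey=09C43A9B270A470B8EB8F2946A9369F3&mintime=%d&q=%s'
             % (mintime, urllib.parse.quote(phrase)))
print(query_url)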