class iTunesScraper(scrapy.Spider):
    """Scrapy spider that collects star ratings and review counts from iTunes app pages."""

    name = 'iTunes Scraper'
    # This variable is used by Scrapy to begin crawling.
    start_urls = []
    # This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
    urls_to_visit = []

    def __init__(self, input_file=None, output_file=None):
        """Validate the CSV files and pre-process the URLs to crawl.

        @param input_file: CSV file holding the URLs to scrape.
        @param output_file: CSV file the scraped rows are written to.
        """
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                  " -a input_file=<your input file> -a output_file=<your output file>\n")
            return
        self.url_helper = UrlHelper("")  # no prefix
        self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                                self.start_urls, None, self.urls_to_visit)

    def make_requests_from_url(self, url):
        # Delegate to UrlHelper so request metadata (start_url) is attached consistently.
        return UrlHelper.make_requests_from_url(url)

    def _count_stars(self, response, row):
        """Return (full, half, ghost) star counts for the rating block at div[row].

        row 2 is the current-version rating, row 4 the all-versions rating.
        """
        base = "//div[@id='left-stack']/div[2]/div[%d]/div[1]/span[@class='%s']"
        return (len(response.xpath(base % (row, 'rating-star'))),
                len(response.xpath(base % (row, 'rating-star half'))),
                len(response.xpath(base % (row, 'rating-star ghost'))))

    def parse(self, response):
        """Extract current-version and all-versions ratings and write one CSV row."""
        fullStars, halfStars, ghostStars = self._count_stars(response, 2)
        reviewCount = response.xpath("//div[@id='left-stack']/div[2]/div[2]/span[2]/text()").extract_first()
        # BUG FIX: extract_first() may return None when the page layout changes;
        # previously this crashed on .strip(). The [:-8] slice drops the trailing
        # " Ratings" suffix — TODO confirm against a live page.
        reviewCount = reviewCount.strip()[:-8] if reviewCount else None

        fullStarsAll, halfStarsAll, ghostStarsAll = self._count_stars(response, 4)
        reviewCountAll = response.xpath("//div[@id='left-stack']/div[2]/div[4]/span[1]/text()").extract_first()
        reviewCountAll = reviewCountAll.strip()[:-8] if reviewCountAll else None

        message = None
        # A valid rating widget renders exactly 5 star spans; any other total
        # means the page was not scraped correctly.
        if fullStars + halfStars + ghostStars != 5 or fullStarsAll + halfStarsAll + ghostStarsAll != 5:
            message = "Error scraping page, scraping skipped."
        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {FIELD_NAMES[0]: response.meta['start_url'],
             FIELD_NAMES[1]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
             FIELD_NAMES[2]: fullStars + 0.5 * halfStars if not message else message,
             FIELD_NAMES[3]: reviewCount if not message else None,
             FIELD_NAMES[4]: fullStarsAll + 0.5 * halfStarsAll if not message else None,
             FIELD_NAMES[5]: reviewCountAll if not message else None})
        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
def on_start():
    """Initialise the module-level helpers and load the user list from disk."""
    global user_list
    global csv_helper
    global data_helper

    data_helper = DataHelper()
    csv_helper = CsvHelper()

    # The user details CSV ships with the project's Resources folder.
    users_csv_path = "../Resources/UserDetails.csv"
    user_list = csv_helper.get_csv_values(users_csv_path)
def split(in_dir, out_dir, csv):
    """Copy the images listed in a CSV file from in_dir into out_dir.

    @param in_dir: directory the source images live in.
    @param out_dir: destination directory; created if it does not exist.
    @param csv: path of the CSV file whose IMAGE_PATH column lists the images.
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
        # CONSISTENCY FIX: the original used Python 2 print statements here,
        # while the rest of this code base uses Python 3 print() calls.
        print("created folder :", out_dir)

    csv_helper = CsvHelper()
    csv_helper.set_csv(csv)
    print("[INFO] using csv file : ", csv)

    image_paths_df = csv_helper.get_single_column(IMAGE_PATH)
    copy_images(image_paths_df, in_dir, out_dir)
class PinterestScraper(scrapy.Spider):
    """Scrapy spider that extracts follower counts from Pinterest profile pages."""

    name = "Pinterest Scraper"
    # This variable is used by Scrapy to begin crawling.
    start_urls = []
    # Maps each start URL to its Pinterest handle, used when writing the output row.
    url_map = {}
    # Queue of URLs still to be crawled; handed to Scrapy one at a time.
    urls_to_visit = []

    def __init__(self, input_file=None, output_file=None):
        """Validate the CSV files and pre-process the URLs to crawl."""
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print(
                "\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__)
                + " -a input_file=<your input file> -a output_file=<your output file>\n"
            )
            return
        self.url_helper = UrlHelper(PREFIX)
        self.url_helper.process_urls_for_scrapy(
            self.csv_helper.get_input_file_content(),
            self.start_urls,
            self.url_map,
            self.urls_to_visit,
        )

    def make_requests_from_url(self, url):
        # Delegate to UrlHelper instead of scrapy.Spider's default implementation.
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        """Scan the page body for the follower count and append a row to the output CSV."""
        pattern = re.compile('.*"pinterestapp:followers"\s*:\s*"(\d+)"')
        followerCount = None
        # The value from the last matching line wins, exactly as in the original scan.
        for line in response.body_as_unicode().split("\n"):
            match = pattern.match(line)
            if match:
                followerCount = match.group(1)
        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {
                FIELD_NAMES[0]: self.url_map[response.meta["start_url"]],
                FIELD_NAMES[1]: response.meta["start_url"],
                FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
                FIELD_NAMES[3]: followerCount,
            },
        )
        # Queue the next URL, if any remain.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
class FacebookScraper(scrapy.Spider):
    """Scrapy spider that extracts the number of likes from Facebook pages."""

    name = 'Facebook Scraper'
    # This variable is used by Scrapy to begin crawling.
    start_urls = []
    # Maps each start URL to its Facebook handle, used when writing the output row.
    url_map = {}
    # Queue of URLs still to be crawled; handed to Scrapy one at a time.
    urls_to_visit = []

    def __init__(self, input_file=None, output_file=None):
        """Validate the CSV files and pre-process the URLs to crawl."""
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                  " -a input_file=<your input file> -a output_file=<your output file>\n")
            return
        self.url_helper = UrlHelper(PREFIX)
        self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                                self.start_urls, self.url_map, self.urls_to_visit)

    def make_requests_from_url(self, url):
        # Delegate to UrlHelper instead of scrapy.Spider's default implementation.
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        """Extract the like count (hidden in an HTML comment) and write one CSV row."""
        # The number of likes happens to live inside an HTML comment; extract all
        # comments and narrow them down to the div carrying LIKES_ELEMENT_NAME.
        comment = response.xpath('//comment()').re(r'<div.*%s.*/div>' % LIKES_ELEMENT_NAME)
        likes_count = None
        # ROBUSTNESS FIX: the original indexed comment[0] unconditionally and
        # raised IndexError when the div was absent; now the row is written with
        # a None count instead, matching how the other scrapers in this project
        # record a value they could not find.
        if comment:
            # Convert the comment text to an HTML DOM object, then pull out the
            # text node holding the number of likes.
            comment_sel = Selector(text=comment[0], type="html")
            likes_count = (comment_sel.xpath('//*[@id="%s"]/*/text()'
                                             % LIKES_ELEMENT_NAME).extract()[0]).replace(',', '').strip()
        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {FIELD_NAMES[0]: self.url_map[response.meta['start_url']],
             FIELD_NAMES[1]: response.meta['start_url'],
             FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
             FIELD_NAMES[3]: likes_count})
        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
def run_display(lidar_points_csv, flight_path_csv):
    """Display every sweep, one at a time.

    Reads the lidar points and flight-path points from the given CSV files
    via CsvHelper, then hands both to DisplayUtil to render each sweep in
    sequence.

    @param lidar_points_csv: file name of the lidar points CSV.
    @param flight_path_csv: file name of the flight-path points CSV.
    """
    reader = CsvHelper(lidar_points_csv, flight_path_csv)
    sweeps = reader.read_lidar_element()
    path = reader.read_flight_path_element()
    DisplayUtil(sweeps, path).display_sweep()
def do_predict(label, category, csvs_folder_path):
    """Run the saved SVM model for one label over the category's test CSV.

    Loads the pickled classifier, predicts class probabilities for every test
    row's title, keeps the top-2 predictions per item, and writes them back
    into the test CSV.

    @param label: name of the label column the model predicts.
    @param category: product category; selects both the model and the test CSV.
    @param csvs_folder_path: folder containing the per-category CSV files.
    """
    model_path = Utilities.construct_filepath(in_dir, [category, label], ".model")
    # RESOURCE FIX: the original called pickle.load(open(...)) and leaked the
    # file handle; the context manager guarantees the file is closed.
    # NOTE(review): pickle.load is unsafe on untrusted files — confirm the
    # .model files are always produced locally.
    with open(model_path, "rb") as model_file:
        text_clf_svm = pickle.load(model_file)

    # Read in the test CSV data and add an (initially empty) column titled
    # `label` so the predictions can be filled in later.
    test_csv_path = csvs_folder_path + category + const.TEST
    csv_helper = CsvHelper()
    csv_helper.set_csv(test_csv_path)
    csv_helper.add_column(label)
    test_df = csv_helper.get_csv()

    # Do the prediction on the title text.
    data_df = test_df["title"]
    id_df = test_df["itemid"]
    predicted_multiclass = text_clf_svm.predict_proba(data_df)
    pred_top2 = get_top_2_dict(text_clf_svm.classes_, predicted_multiclass, id_df)

    # Write the top-2 predictions back into the original test CSV.
    write_to_csv(pred_top2, test_df, test_csv_path)
class ChromeScraper(scrapy.Spider):
    """Scrapy spider that extracts user counts from Chrome Web Store pages."""

    name = 'Chrome Scraper'
    # This variable is used by Scrapy to begin crawling.
    start_urls = []
    # Queue of URLs still to be crawled; handed to Scrapy one at a time.
    urls_to_visit = []

    def __init__(self, input_file=None, output_file=None):
        """Validate the CSV files and pre-process the URLs to crawl."""
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                  " -a input_file=<your input file> -a output_file=<your output file>\n")
            return
        self.url_helper = UrlHelper("")  # no prefix
        self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                                self.start_urls, None, self.urls_to_visit)

    def make_requests_from_url(self, url):
        # Delegate to UrlHelper instead of scrapy.Spider's default implementation.
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        """Scan the page body for the user count and append a row to the output CSV."""
        pattern = re.compile('.*name\s*=\s*"user_count"\s*>\s*(\d+)\s*<')
        userCount = None
        # The value from the last matching line wins, exactly as in the original scan.
        for line in response.body_as_unicode().split('\n'):
            match = pattern.match(line)
            if match:
                userCount = match.group(1)
        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {FIELD_NAMES[0]: response.meta['start_url'],
             FIELD_NAMES[1]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
             FIELD_NAMES[2]: userCount})
        # Queue the next URL, if any remain.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
def __init__(self, input_file=None, output_file=None):
    """Validate the CSV files and pre-process the URLs to crawl.

    @param input_file: CSV file holding the URLs to scrape.
    @param output_file: CSV file the scraped rows are written to.
    """
    self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
    if self.csv_helper.stop:
        usage = "\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__)
        print(usage + " -a input_file=<your input file> -a output_file=<your output file>\n")
        return
    # This scraper needs no URL prefix.
    self.url_helper = UrlHelper("")
    self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                            self.start_urls, None, self.urls_to_visit)
class TwitterScraper(scrapy.Spider):
    """Scrapy spider that extracts follower counts from Twitter profile pages."""

    name = 'Twitter Scraper'
    # This variable is used by Scrapy to begin crawling.
    start_urls = []
    # Maps each start URL to its Twitter handle, used when writing the output row.
    url_map = {}
    # Queue of URLs still to be crawled; handed to Scrapy one at a time.
    urls_to_visit = []

    def __init__(self, input_file=None, output_file=None):
        """Validate the CSV files and pre-process the URLs to crawl."""
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                  " -a input_file=<your input file> -a output_file=<your output file>\n")
            return
        self.urlHelper = UrlHelper(PREFIX)
        self.urlHelper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                               self.start_urls, self.url_map, self.urls_to_visit)

    # Overridden so requests come from UrlHelper rather than the default
    # implementation in scrapy.Spider.
    def make_requests_from_url(self, url):
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        """Extract the follower count and write one CSV row."""
        # The follower count sits in the title attribute of the element whose
        # data-nav attribute equals "followers"; drop the thousands separators.
        raw_count = response.xpath('//*[@data-nav="followers"]/@title').re("[\d,]*")[0]
        followers_count = raw_count.replace(',', '')
        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {FIELD_NAMES[0]: self.url_map[response.meta['start_url']],
             FIELD_NAMES[1]: response.meta['start_url'],
             FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
             FIELD_NAMES[3]: followers_count})
        # Queue the next URL, if any remain.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
def test_optimization(lidar_points_file_name, flight_path_file_name):
    """Optimize the flight path and display each sweep of the result.

    Reads the lidar points and flight-path points from the given CSV files,
    runs the Optimization pass over them, and displays the optimized sweeps
    one by one. If the two files disagree on the number of sweeps, an error
    is printed and nothing is displayed.

    @param lidar_points_file_name: file name of the lidar points CSV.
    @param flight_path_file_name: file name of the flight-path points CSV.
    """
    csv_helper = CsvHelper(lidar_points_file_name, flight_path_file_name)
    flight_path = csv_helper.read_flight_path_element()
    lidar_points = csv_helper.read_lidar_element()
    if len(flight_path) != len(lidar_points):
        # BUG FIX: corrected the garbled error message
        # ("Number of sweep in booth file is not equal").
        print("Wrong given data! Number of sweeps in both files is not equal")
    else:
        optimization = Optimization(lidar_points, flight_path)
        optimized_lidar_points, optimized_flight_path = optimization.optimization()
        display_helper = DisplayUtil(optimized_lidar_points, optimized_flight_path)
        display_helper.display_sweep()
def run_flight_optimization(lidar_points_csv, flight_path_csv, output_flight_path_csv):
    """Optimize the flight path and persist the result.

    Reads the lidar points and flight-path points from the given CSV files,
    runs the Optimization pass over them, and writes the optimized flight
    path to the requested output file.

    @param lidar_points_csv: file name of the lidar points CSV.
    @param flight_path_csv: file name of the flight-path points CSV.
    @param output_flight_path_csv: file the optimized flight path is written to.
    """
    helper = CsvHelper(lidar_points_csv, flight_path_csv)
    points = helper.read_lidar_element()
    path = helper.read_flight_path_element()
    optimizer = Optimization(points, path)
    _optimized_points, optimized_path = optimizer.optimization()
    helper.write_flight_path(optimized_path, output_flight_path_csv)
losses[ item] = "categorical_crossentropy" # specifies the weight per loss lossWeights[item] = 1.0 return losses, lossWeights def gen_y_parameter_for_fit(wanted_categories_output, labels): y_dict = {} for i in range(0, len(wanted_categories_output)): y_dict[wanted_categories_output[i]] = labels[i] return y_dict if __name__ == "__main__": image_dir, out_dir, category, csv_path = process_arg() csv_helper = CsvHelper() label_header_list = get_label_headers(csv_helper, csv_path, category + "_image") data, all_labels = set_data_labels(image_dir, csv_helper) wanted_categories = get_wated_categories(category) wanted_categories_lower = gen_name(wanted_categories, to_add="") wanted_categories_output = gen_name(wanted_categories, to_add="_output") wanted_categories_loss = gen_name(wanted_categories, to_add="_output_loss") # selects the labels listed in wanted categories labels_to_use = get_labels_to_use(label_header_list, all_labels, wanted_categories) # binarize the labels binarized_labels_dict = binarizer(
""" if __name__ == "__main__": save_model = False args = Utilities.process_arg("-save") out_dir = "/content/drive/My Drive/models" csvs_folder_path = "/content/drive/My Drive/yy_fashion.csv" category = "fashion" save = "true" if save == "t" or save == "T" or save == "true": save_model = True train_csv_path = "/content/drive/My Drive/yy_fashion.csv" csv_helper = CsvHelper() csv_helper.set_csv(train_csv_path) # set csv file as train_csv_path print("Set csv done") ''' This part is for mass training like for each attribute/ label just train a model for individual model tuning, set labels with a single label , like like 123 ''' csv_helper.set_all_headers_as_label( ) # set all the labels as label headers labels = csv_helper.get_label_headers( ) # get lables i.e all column name besides image name, itemid and title #labels = ["Colour_group"] # just doing print("Predicting These classes : ", labels) for label in labels: do_training(label, csv_helper, save_model)
def print_usage():
    """Print the command-line invocation expected by this script."""
    print("Run as:\npython %s <input file> <output file>" % os.path.basename(__file__))


if len(sys.argv) != 3:
    print_usage()
    exit(0)

# Maps each URL to its Twitter phrase, used when populating the output file.
url_map = {}
# All the URLs that will be visited.
urls_to_visit = []

# CsvHelper validates the two command-line files and aids CSV processing.
csv_helper = CsvHelper(FIELD_NAMES, sys.argv[1], sys.argv[2])
if csv_helper.stop:
    print_usage()
    exit(0)

# Take the system's current time as seconds since the epoch, subtract the
# number of seconds in 24 hours, and use the result as the lower bound for
# counting tweets.
current_time = time.mktime(time.localtime())
time_24_hours_ago = current_time - constants.SECONDS_IN_24H_COUNT
time_query_parameter = "&mintime=" + repr(int(time_24_hours_ago))

# The UrlHelper builds the search-count URLs the queries are appended to.
url_helper = UrlHelper(
    'http://otter.topsy.com/searchcount.js?dynamic=1&count_method=citation&' +
    'apikey=09C43A9B270A470B8EB8F2946A9369F3&%s&q=' % time_query_parameter)