def generate_constant_csvs(level="Family Name"):
    if not file_utils.folder_exists("top_category_files/"):
        generate_top_category_files(level)
    else:
        print("top_category_files/ directory already exists")
    if not file_utils.file_exists("top_category_strings.csv"):
        generate_top_category_string_csv()
    else:
        print("top_category_strings.csv file already exists")
    if not file_utils.file_exists("brand_counts.csv"):
        generate_brand_counts_csv()
    else:
        print("brand_counts.csv file already exists")
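# Usage sketch: generate_constant_csvs() is idempotent, so re-running it only
# builds whatever artifacts are missing. The "Brand" level below is a
# hypothetical alternative grouping column, not one the code above requires.
generate_constant_csvs()               # default "Family Name" level
generate_constant_csvs(level="Brand")  # hypothetical alternative level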
def __download_image_file(self):
    """
    Downloads the image file
    :return: the image file object
    """
    if not file_utils.file_exists(self.image_file_path):
        logger.info('Downloading Image from - ' + self.image_url)
        return file_utils.download(self.image_url, self.download_path)
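# A minimal sketch of what the file_utils.download helper assumed above might
# look like (hypothetical; the real implementation may differ). It fetches the
# URL into download_path and hands back an open file object, matching the
# ":return: the image file object" contract of __download_image_file.
import os
import urllib.request

def download(url, download_path):
    # Derive the local filename from the last path segment of the URL.
    local_path = os.path.join(download_path, os.path.basename(url))
    urllib.request.urlretrieve(url, local_path)
    return open(local_path, 'rb')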
def testFileExists(self):
    """ Ensure the file_utils.file_exists() method returns true with an existing file """
    if not os.path.exists(self.tmpFile):
        # Create an empty file at the temporary path; a directory here would
        # (correctly) make file_exists() return False.
        with open(self.tmpFile, 'w'):
            pass
    result = file_utils.file_exists(self.tmpFile)
    self.assertTrue(result)
def __get_image_file(self):
    """
    Returns the image file reference. If the image file does not exist, download it
    :return: the image file object
    """
    if file_utils.file_exists(self.image_file_path):
        logger.info('Found existing image file')
        # Open in binary mode since this is an image, not text.
        return open(self.image_file_path, 'rb')
    if not os.path.exists(self.download_path):
        os.makedirs(self.download_path)
    return self.__download_image_file()
def match_sites_dataframe(dataframe, matches_json="", top_n=5):
    '''
    Generates a dataframe of matched sites. matches_json is an optional
    parameter for saving and loading slow-to-generate description-based matches.
    INPUTS:
    - dataframe
    - matches_json -- A string naming a json file of old matches, used to speed up processing
    - top_n (int) -- Maximum number of matches to return for each item
    OUTPUTS:
    - matches_df
    '''
    # Missing values should be represented by empty strings
    dataframe = dataframe.fillna(value="")
    # Ensure we have the correct columns
    dataframe = pandas.DataFrame(dataframe.to_dict("records"), columns=ALL_FIELDNAMES)
    # Fill any columns we just added with "-1" to mark they weren't originally there
    dataframe = dataframe.fillna(value="-1")
    # Make sure everything in the dataframe is a string
    dataframe = dataframe.applymap(str)
    # Remove extra whitespace
    dataframe = dataframe.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    if "Match Site" in dataframe.columns:
        ndf = dataframe[dataframe["Match Site"] == "-1"]
        if ndf.empty:
            # No new rows.
            return pandas.DataFrame()
        odf = dataframe[dataframe["Match Site"] != "-1"]
        if odf.empty:
            old_rows = []
        else:
            old_rows = odf.to_dict("records")
        new_rows = ndf.to_dict("records")
    else:
        new_rows = dataframe.to_dict("records")
        old_rows = []
    # Add a 'Description' field to new_rows
    site_rows = [{**row, "Description": row["Stock Description"]} for row in new_rows]
    old_site_rows = remove_duplicate_rows(old_rows)
    old_item_ids_to_rows = generate_item_ids_to_rows(old_rows)
    # Load desc_matches from matches_json if present; otherwise generate and cache them
    desc_matches = {}
    if matches_json:
        if file_utils.file_exists(matches_json):
            desc_matches = file_utils.read_json(matches_json)
        else:
            desc_matches = match_by_description(site_rows, old_site_rows)
            file_utils.save_json(matches_json, desc_matches)
    matches_rows = match_sites(site_rows, old_site_rows, old_item_ids_to_rows,
                               desc_matches, top_n=top_n)
    matches_df = pandas.DataFrame(matches_rows, columns=OUTPUT_FIELDNAMES)
    matches_df = matches_df.fillna(value="")
    matches_df = matches_df[OUTPUT_FIELDNAMES]
    return matches_df
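# Usage sketch for match_sites_dataframe. Column names other than
# "Stock Description" and "Match Site" are assumptions; real input would
# normally come from file_utils.read_csv as in the CLI entry point below.
rows = [
    {"Item ID": "101", "Stock Description": "10mm hex bolt", "Match Site": "-1"},
    {"Item ID": "102", "Stock Description": "12mm hex bolt", "Match Site": "-1"},
]
df = pandas.DataFrame(rows)
# Caches description matches in desc_matches.json so reruns skip the slow step.
matches_df = match_sites_dataframe(df, matches_json="desc_matches.json", top_n=3)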
"Maximum amount of matches to return for each row. Default value is 5.", type=int, default=5) args = parser.parse_args() sites_rows = file_utils.read_csv(args.filename) output_file = args.output matches_json = args.match_data if not matches_json: matches_json = "" top_n = args.matches stime = time.time() if file_utils.file_exists(output_file): old_rows = file_utils.read_csv(output_file) else: old_rows = [] ndf = pandas.DataFrame(sites_rows) odf = pandas.DataFrame(old_rows) all_columns = ndf.columns.union(odf.columns) ndf = ndf.reindex(columns=all_columns, fill_value="-1") odf = odf.reindex(columns=all_columns, fill_value="-1") df = pandas.concat([ndf, odf]).reset_index(drop=True) if output_file: matches_df = match_sites_dataframe(df, matches_json=matches_json, top_n=top_n)
def testFileNotExist(self):
    """ Ensure the file_utils.file_exists() method returns false with a bogus file """
    result = file_utils.file_exists('/foo/bar.txt')
    self.assertFalse(result)
def testFileIsDirectory(self):
    """ Ensure the file_utils.file_exists() method returns false with a directory """
    result = file_utils.file_exists(self.tmpDir)
    self.assertFalse(result)
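# Taken together, these tests pin down the file_exists() contract: True for an
# existing regular file, False for directories and for paths that do not
# exist. A minimal implementation consistent with that contract (a sketch,
# assuming file_utils is free to delegate to os.path) would be:
import os

def file_exists(path):
    # os.path.isfile() already returns False for directories and missing
    # paths, which is exactly the behaviour the tests assert.
    return os.path.isfile(path)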