import codecs
import csv
import inspect
import os
import pathlib
import re
import shutil
import time
import urllib.parse

import fake_useragent
import openpyxl
import openpyxl.utils.exceptions
import PIL.Image
import PIL.ImageDraw
import PIL.ImageFont
import requests
import selenium.webdriver
import urllib3
import xlrd

# Project-local modules (defined elsewhere in this repo, not shown here): Sw,
# SwPrint, GlobalFunctions. SwPrint replaces the built-in print(), which is why
# some calls below pass only_debug=True. Helpers such as __init,
# __view_enhancement, _get_new_file_name_with_datetime, __titles, __find_index,
# __correct, alphabet, read_file, save_text_to_file and get_simple_html are
# also defined elsewhere in the repo.


def save_to_xlsx(data, filename='', fieldnames=None, optimize=False,
                 open=False, date_insert=True):
    if not __check_data(data, filename):
        return None
    data, fieldnames = __init(data, filename, fieldnames, optimize)
    newfilename = _get_new_file_name_with_datetime('.xlsx', filename, date_insert)
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.append(fieldnames)
    i = -1
    for i, each in enumerate(data.values() if isinstance(data, dict) else data):
        line = []
        for key in fieldnames:
            value = each.get(key, '')
            # keep short numbers as numbers; everything else becomes text
            if not isinstance(value, (int, float)) or len(str(value)) > 10:
                value = str(value) if not optimize else re.sub(r'\s+', ' ', str(value)).strip()
            line.append(value)
        try:
            ws.append(line)
        except openpyxl.utils.exceptions.IllegalCharacterError:
            print(f'save_to_xlsx: IllegalCharacterError: {line}')  # Todo: only for debug
            # str(x) guards against non-string cells in the same row
            ws.append([str(x).encode('unicode_escape').decode('utf-8') for x in line])
    __view_enhancement(ws)
    wb.save(newfilename)
    print(f"{newfilename} / {i + 1} lines saved / ", end='')
    if open:
        os.startfile(newfilename)
    return newfilename
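# A minimal usage sketch (hypothetical data, not part of the module): given a
# list of dicts, save_to_xlsx writes one row per dict and returns the file name.
#
#     rows = [{'sku': 'A-1', 'name': 'Widget'}, {'sku': 'B-2', 'name': 'Gadget'}]
#     saved = save_to_xlsx(rows, filename='export', optimize=True)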
def delete_file(url):
    file_name = Sw.get_cache_path(url)
    if os.path.exists(file_name):
        os.remove(file_name)
        print(f'=== delete file {url}')
    else:
        print(f'=== file not found {url}')
def save_to_csv(data, filename='', fieldnames=None, optimize=False,
                open=False, date_insert=True, SEP=','):
    if not __check_data(data, filename):
        return None
    QC, NL = '"', '\r\n'  # quote char, new line
    data, fieldnames = __init(data, filename, fieldnames, optimize)
    # default to .csv for any separator other than tab (the original left
    # file_extension unassigned for non-comma, non-tab separators)
    file_extension = '.tsv' if SEP == '\t' else '.csv'
    newfilename = _get_new_file_name_with_datetime(file_extension, filename, date_insert)
    with codecs.open(newfilename, 'w', encoding='utf-8') as file:
        file.write(SEP.join([f'{QC}{x}{QC}' for x in fieldnames]) + NL)
        i = -1
        for i, each in enumerate(data.values() if isinstance(data, dict) else data):
            line = []
            for key in fieldnames:
                value = each.get(key, '')
                value = str(value) if not optimize else re.sub(r'\s+', ' ', str(value)).strip()
                line.append(value.replace('"', '""'))
            file.write(SEP.join([f'{QC}{x}{QC}' for x in line]) + NL)
    print(f"{newfilename} / {i + 1} lines saved / ", end='')
    if open:
        os.startfile(newfilename)
    return newfilename
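# Usage sketch (hypothetical data): SEP='\t' switches the extension to .tsv;
# any other separator still produces a .csv file.
#
#     save_to_csv(rows, filename='export', SEP='\t')  # writes export.tsv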
def change_main_column(data, maincolumn):
    result = {}
    for each in data.values():
        result[each[maincolumn]] = each
    print(f'change_main_column: {len(data)} lines / {len(result)} loaded / '
          f'{len(data) - len(result)} lost')
    return result
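# Usage sketch (hypothetical data): re-key a loaded table by another column.
# Rows with duplicate keys collapse into one, which the printed "lost" counter
# reports.
#
#     by_sku = change_main_column(load('products.xlsx'), 'sku')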
def _copyfile(url, cache_path, new_path, file_type, path):
    if os.stat(cache_path).st_size == 0:
        return None  # Todo: try to download one more time
    if 'images' in path and file_type in ['.jpg', '.png', '.gif']:
        img = PIL.Image.open(cache_path)
        if (img.size[0] + img.size[1]) < 200:
            return None  # skip tiny images (icons, placeholders)
    print(f'copy file {url}', only_debug=True)
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)
    shutil.copyfile(cache_path, new_path)
    return True
def __generate_fieldnames_optimized(data, fieldnames):
    new_fieldnames = []
    for each in data.values() if isinstance(data, dict) else data:
        if not isinstance(each, dict):
            raise ValueError('Wrong data')
        for key, value in each.items():
            if value != '' and key not in new_fieldnames:
                new_fieldnames.append(str(key))
    additional_fields = [x for x in new_fieldnames if x not in fieldnames]
    cleared_fields = [x for x in fieldnames if x not in new_fieldnames]
    if cleared_fields:
        print('deleted columns: ' + ', '.join(cleared_fields))
    return [x for x in fieldnames if x in new_fieldnames] + additional_fields
def generate_img(sku, name, path='images', brand='logo'):
    file_name = f'{Sw.good_name(Sw.transliterate(name))}.jpg'
    full_path = f'{path}\\{file_name}'
    if os.path.exists(full_path):
        return file_name
    print(f'generate img {full_path}')
    # raw string: the original 'C:\Windows\...' relied on invalid escapes
    fnt = PIL.ImageFont.truetype(r'C:\Windows\Fonts\Arial.ttf', 60)
    img = PIL.Image.open(f'{brand.lower().replace(" ", "_")}.png').convert('RGB')
    d = PIL.ImageDraw.Draw(img)
    d.text((60, 880), sku, fill=0, font=fnt)
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)
    img.save(full_path, quality=100, optimize=True, progressive=True)
    return file_name
def download(url):
    try:
        page = requests.get(url, headers={'User-Agent': fake_useragent.UserAgent().chrome})
    except Exception:
        time.sleep(1)  # brief pause, then one retry
        try:
            page = requests.get(url, headers={'User-Agent': fake_useragent.UserAgent().chrome})
        except Exception:
            print(f'=== download error {url}')
            return None
    if page.status_code != 200:
        print(f'=== page status code {page.status_code} for {url}')
        return None
    return page
def _create_web_driver(url):
    global _parsing_web_driver
    chrome_options = selenium.webdriver.ChromeOptions()
    chrome_options.add_argument("--start-maximized")
    prefs = {"profile.managed_default_content_settings.images": 2}  # disable image loading
    # prefs = {}
    chrome_options.add_experimental_option("prefs", prefs)
    _parsing_web_driver = selenium.webdriver.Chrome(
        'C:\\Users\\Administrator\\Documents\\_python\\webdriver\\chromedriver.exe',
        options=chrome_options)
    _parsing_web_driver.implicitly_wait(10)
    _parsing_web_driver.get(url)
    input('continue?')  # pause for manual login / captcha before parsing starts
    print('continuing')
def tmp():  # referenced so that "Optimize imports" keeps these imports
    time(), urllib(), bs4(), Category(), Deepl(), FindDigits(), Html()
    LoadDictFromFile(), Parsing(), Product(), SaveDictToFile(), Sw()
    WorkWithJSON(), print(), datetime(), quote(), urljoin()
def _xls_import(filename, maincolumn, language, optimize, recognize, title_row, first_row):
    res = {}
    sheet = xlrd.open_workbook(filename).sheet_by_index(0)  # Todo
    titles = __titles(__xls_titles(sheet, optimize, title_row), language, optimize)
    index = __find_index(maincolumn, titles)
    first_row = max(first_row, title_row + 1) if title_row else first_row
    for a in range(first_row - 1, sheet.nrows):
        row = [__correct(__xlrd_get_value(sheet.cell(a, col)), optimize)
               for col in range(0, len(titles))]
        name = str(row[index] if index is not None else a + 1)
        if name:
            res[name] = {titles[i]: row[i] for i in range(0, len(titles))}
    rows_count = sheet.nrows - title_row if title_row else sheet.nrows
    print(f"{filename} / {rows_count} lines / {len(res)} loaded / "
          f"{rows_count - len(res)} lost / ", end='')
    if recognize:
        _recognize_data(res)
    return res
def download_file_from_web(url, name, path='images'):
    if not name:
        name = urllib.parse.unquote(url[url.rfind('/') + 1:url.rfind('.')])
    name = Sw.good_name(name)
    cache_path = Sw.get_cache_path(url)
    right_part = url[url.rfind('/') + 1:]
    if '?' in right_part:
        right_part = right_part[:right_part.rfind('?')]
    file_type = right_part[right_part.rfind('.'):].lower() if '.' in right_part else ''
    if file_type == '.jpeg':
        file_type = '.jpg'
    if 'treston' in path and not file_type:
        file_type = '.pdf'
    if len(file_type) > 4 or not file_type:
        print(f'=== bad file_type "{file_type}" in url {url}')
        return ''  # raise ValueError
    new_path = f'{path}\\{name}{file_type}'
    if os.path.exists(cache_path) and os.path.exists(new_path):
        if os.stat(cache_path).st_size == os.stat(new_path).st_size:
            print(f'do nothing {url}', only_debug=True)
        else:
            # print(f'st_size: {os.stat(cache_path).st_size} != {os.stat(new_path).st_size}')
            if _copyfile(url, cache_path, new_path, file_type, path) is None:
                return ''
    elif os.path.exists(cache_path) and not os.path.exists(new_path):
        if _copyfile(url, cache_path, new_path, file_type, path) is None:
            return ''
    else:
        print(f'download file {url}')
        urllib3.disable_warnings()
        page = download(url)
        if page is None:
            return ''
        if '\\' in cache_path:
            pathlib.Path(cache_path[:cache_path.rfind('\\')]).mkdir(parents=True, exist_ok=True)
        with open(cache_path, 'wb') as file:
            file.write(page.content)
        if _copyfile(url, cache_path, new_path, file_type, path) is None:
            return ''
    return new_path.replace(f'{path}\\', '').lower()
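# Usage sketch (hypothetical URL): the function returns only the final file
# name (lower-cased, without the path prefix), or '' on any failure.
#
#     img = download_file_from_web('https://example.com/img/a.jpg', 'a', path='images')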
def _xlsx_import(filename, maincolumn, language, optimize, recognize, title_row, first_row):
    res = {}
    sheet = openpyxl.load_workbook(filename).active
    titles = __titles(__xlsx_titles(sheet, optimize, title_row), language, optimize)
    index = __find_index(maincolumn, titles)
    first_row = max(first_row, title_row + 1) if title_row else first_row
    for a in range(first_row, sheet.max_row + 1):
        row = [__correct(sheet.cell(row=a, column=col).value, optimize)
               for col in range(1, len(titles) + 1)]
        name = str(row[index] if index is not None else a)
        if name:
            res[name] = {titles[i]: row[i] for i in range(0, len(titles))}
    rows_count = sheet.max_row - title_row if title_row else sheet.max_row
    print(f"{filename} / {rows_count} lines / {len(res)} loaded / "
          f"{rows_count - len(res)} lost / ", end='')
    if recognize:
        _recognize_data(res)
    return res
def _csv_import(filename, maincolumn, language, optimize, recognize, delimiter, title_row, first_row):
    res = {}
    with codecs.open(filename, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=delimiter, quotechar='"')
        data = [row for row in reader]
    titles = __titles(data[title_row - 1], language, optimize) if title_row else alphabet(len(data[0]))
    index = __find_index(maincolumn, titles)
    for a, row in enumerate(data[first_row - 1:]):
        if not len(row):
            continue
        name = str(__correct(row[index], optimize) if index is not None else a + 2)
        if name:
            res[name] = {titles[i]: __correct(row[i], optimize) for i in range(0, len(titles))}
    print(f"{filename} / {len(data) - 1} lines / {len(res)} loaded / "
          f"{len(data) - 1 - len(res)} lost / ", end='')
    if recognize:
        _recognize_data(res)
    return res
def get_htmls_from_webdriver(url, file_name, additional_func=None):
    global _parsing_web_driver
    if not _parsing_web_driver:
        _create_web_driver(url)
    try:
        _parsing_web_driver.get(url)
    except Exception:
        try:
            _parsing_web_driver.get(url)
        except Exception:
            print(f'=== download error {url}')
            return ''
    time.sleep(2)
    html_text = _parsing_web_driver.page_source
    # if '404 - File or directory not found.' in html_text:
    #     print(f'=== code 404 {url}')
    #     return ''
    if additional_func is not None:
        additional_func(_parsing_web_driver)
        # re-read the page source after additional_func has interacted with it
        html_text = _parsing_web_driver.page_source
    save_text_to_file(file_name, html_text)
    return html_text
def load(filename, maincolumn=None, language=None, optimize=False, recognize=False,
         delimiter=',', title_row=1, first_row=2):
    if filename.endswith('.csv'):
        return _csv_import(filename, maincolumn, language, optimize, recognize,
                           delimiter, title_row, first_row)
    elif filename.endswith('.xls'):
        return _xls_import(filename, maincolumn, language, optimize, recognize,
                           title_row, first_row)
    elif filename.endswith('.xlsx') or filename.endswith('.xlsm'):
        try:
            return _xlsx_import(filename, maincolumn, language, optimize, recognize,
                                title_row, first_row)
        except KeyError:
            print('Error: bad file format. Will try to use xls instead')
            return _xls_import(filename, maincolumn, language, optimize, recognize,
                               title_row, first_row)
    else:
        raise ValueError(f'Wrong filetype: {filename}')
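# Usage sketch (hypothetical file): the importer is chosen by extension, and
# maincolumn selects which column becomes the dict key.
#
#     products = load('products.xlsx', maincolumn='sku', optimize=True)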
def check_sku(sku, url, good_symbols=' .-+/'):
    if not sku:
        print(f'=== no sku {url}')
        return False
    if len(sku) > 21:
        print(f'=== long sku {sku} {url}')
    for each in sku:
        if each.isalpha() or each.isdecimal() or each in good_symbols:
            continue
        print(f'=== special characters in sku {sku} {url}')
        return False
    return True
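# Usage sketch (hypothetical values): an over-long SKU only warns, while a
# disallowed character fails the check.
#
#     check_sku('AB-123/4', 'https://example.com/p/1')   # True
#     check_sku('AB_123', 'https://example.com/p/1')     # False ('_' not allowed)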
def wrapper2(*args, **kwargs):
    # func and debug come from the enclosing decorator's scope (not shown here)
    start_time = time.time()
    start_file_name = inspect.getfile(func)
    if '/' in start_file_name:
        start_file_name = start_file_name[start_file_name.rfind('/') + 1:]
    if '\\' in start_file_name:
        start_file_name = start_file_name[start_file_name.rfind('\\') + 1:]
    SwPrint.SwPrint(debug=debug, prj_name=start_file_name)
    print('start')
    result = func(*args, **kwargs)
    global _parsing_web_driver
    if _parsing_web_driver:
        _parsing_web_driver.quit()
    print(f'done in {GlobalFunctions.generate_time_string(time.time() - start_time)}')
    print('end')
    SwPrint.SwPrint.save_log_to_file()
    return result
def get_htmls_from_web(url, simple=False, additional_func=None):
    result = []
    file_name = Sw.get_cache_path(url, html=True)
    if os.path.exists(file_name):
        print(f'use existing {url}', only_debug=True)
        result.append(read_file(file_name))
        i = 1
        while True:
            file_name_dop = f'{file_name}_{i + 1}.html'
            if not os.path.exists(file_name_dop):
                break
            print(f'use existing extra page {file_name_dop}', only_debug=True)
            result.append(read_file(file_name_dop))
            i += 1
    else:
        print(f'download {url}')
        if simple:
            result.append(get_simple_html(url, file_name))
        else:
            result.append(get_htmls_from_webdriver(url, file_name, additional_func))
    return result
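# Usage sketch (hypothetical URL): results come from the on-disk cache when
# present, otherwise from requests (simple=True) or the Selenium driver.
#
#     htmls = get_htmls_from_web('https://example.com/catalog', simple=True)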
def correct_images_sources(soup, source_url=''):
    for tag in soup.find_all('img'):
        src = urllib.parse.urljoin(source_url, tag.get('src'))
        print(f'got image {src}')
        tag.attrs.clear()
        tag.attrs['src'] = src
def __change_file_type(file_name, file_type):
    if '.' in file_name:
        file_name = file_name.replace(file_name[file_name.rfind('.'):], f'.{file_type}')
    else:
        print(f'=== download_imgs error - no file type {file_name}')
    return file_name
def __check_data(data, filename):
    if data:
        return True
    print(f'{filename} / nothing to save / ', end='')
    return False