Example #1
def rebuild_mf_index():
    global driver
    url = credentials['url_module_mf'].strip() + '&token=' + token
    driver.get(url)
    refresh = WebDriverWait(driver, 300).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#msg-refresh_ocmod_cache > span')))
    sleep(6)
    crossparser_tools.write_to_log(refresh.text)
Example #2
def checking_all_diggers():
    global current_site
    global digger_id

    is_done = True

    for attr, value in websites.items():
        digger_id = value
        current_site = attr

        session_info = check_status(digger_id, False)
        if session_info['state'] == 'running':
            is_done = False
        else:
            parse_session(session_info)

            link_to_parse = get_nextlink_forsite(current_site)

            if link_to_parse != '':
                crossparser_tools.write_to_log('start digger on link: ' +
                                               link_to_parse +
                                               ', digger_id: ' + digger_id)
                start_digger(digger_id, link_to_parse)
                is_done = False

    sleep(10)
    if is_done == False:
        checking_all_diggers()
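Note: checking_all_diggers() re-invokes itself every 10 seconds until every digger has finished, so a long run keeps adding stack frames and can eventually hit Python's default recursion limit. A minimal iterative sketch with the same behaviour, reusing the helpers from this example (an assumption, not the original code):

# Sketch: loop-based equivalent of checking_all_diggers() above (assumes the same
# websites dict and helper functions as the recursive version).
def checking_all_diggers_loop():
    global current_site
    global digger_id

    while True:
        is_done = True
        for attr, value in websites.items():
            digger_id = value
            current_site = attr

            session_info = check_status(digger_id, False)
            if session_info['state'] == 'running':
                is_done = False
            else:
                parse_session(session_info)
                link_to_parse = get_nextlink_forsite(current_site)
                if link_to_parse != '':
                    start_digger(digger_id, link_to_parse)
                    is_done = False

        if is_done:
            break
        sleep(10)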
Example #3
def start_digger(digger_id, link_to_parse):
    headers = {
        'Authorization': 'Token ' + token,
        'Content-type': 'application/json'
    }
    URL = 'https://www.diggernaut.com/api/diggers/' + digger_id + '/start'
    PARAMS = '{"variables":{"target_page":"' + link_to_parse + '"}}'

    r = requests.post(url=URL, headers=headers, data=PARAMS)
    data = r.json()
    crossparser_tools.write_to_log(data)
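start_digger() assembles the JSON body by string concatenation, which produces invalid JSON if link_to_parse contains a double quote or backslash. A hedged alternative (assuming the same Diggernaut endpoint and token) lets requests serialize the payload:

# Sketch: same request as start_digger(), but with the payload serialized via requests'
# json= parameter so special characters in the link are escaped correctly
# (assumption, not the original code).
import requests

def start_digger_json(digger_id, link_to_parse, token):
    headers = {'Authorization': 'Token ' + token}
    url = 'https://www.diggernaut.com/api/diggers/' + digger_id + '/start'
    payload = {'variables': {'target_page': link_to_parse}}
    r = requests.post(url=url, headers=headers, json=payload)
    return r.json()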
Example #4
def start_all_diggers():
    global current_site
    global digger_id

    for attr, value in websites.items():
        digger_id = value
        current_site = attr

        link_to_parse = get_nextlink_forsite(current_site)

        crossparser_tools.write_to_log('start parsing: ' + current_site +
                                       ', digger_id: ' + digger_id)

        start_digger(digger_id, link_to_parse)
Example #5
def make_csv(file):

    global items_counter_parsed
    global csv_out_data_counter
    csv_out_data_counter = 0

    file_lines = crossparser_tools.read_csv(file, True)

    global table_titles_list
    table_titles_list = crossparser_tools.table_titles_list

    filename = file + '-import.csv'

    with open(filename, 'w+', newline='', encoding="utf8") as csvexportfile:

        #Make header
        row_out = []
        for attr, value in export_fields.items():
            row_out.append(value)

        csvexportfile.write(';'.join(row_out) + '\n')

        #Parse lines
        for row in file_lines:
            if row == '':
                continue

            items_counter_parsed += 1

            row = row.strip().replace('\n', '').replace('\r', '')

            if credentials['is_server'] == 'no':
                parse_row(row, csvexportfile)
            if credentials['is_server'] == 'yes':
                try:
                    parse_row(row, csvexportfile)
                except Exception as e:
                    crossparser_tools.write_to_log(
                        'failed to parse row of file: ' + file + '. row: ' +
                        row)
                    crossparser_tools.write_to_log(e)

    if csv_out_data_counter == 0:
        crossparser_tools.write_to_log('Failed to make csv file of ' +
                                       current_site + '. No entries ')
        os.remove(filename)
        return

    with open(temp_folder + 'files_prod_import.txt',
              'a',
              newline='',
              encoding="utf8") as files_toimport:
        files_toimport.write(filename + '\n')

    crossparser_tools.write_to_log('Made csv file with ' +
                                   str(csv_out_data_counter) + ' items of ' +
                                   current_site + '. Saved to file: ' +
                                   filename)
Example #6
def check_parse_new_link(site, is_done):
    #Start parse new link:
    global websites_parsed
    global tabs_delay

    link = get_nextlink_forsite(site)
    if link != '':
        tabs_delay[site] = datetime.datetime.now()
        parse_link(site, link)
        is_done = False
    else:
        crossparser_tools.write_to_log('Done parse all links for site: ' +
                                       site)
        #window_handles.pop(site)
        websites_parsed[site] = True

    return is_done
Example #7
def import_files():

    global driver

    if os.path.isfile(files_prod_import):
        if os.path.isfile(files_categ_import):

            parse_websites()

            if len(categs_files) == 0 or len(products_files) == 0:
                crossparser_tools.write_to_log('Nothing to import')
                return

            options = webdriver.ChromeOptions()

            if credentials['is_server'] == 'no':
                chromedriver_path = config_folder + 'chromedriver.exe'
                options.add_argument('--window-size=1200,700')

            if credentials['is_server'] == 'yes':
                chromedriver_path = config_folder + 'chromedriver'
                options.add_argument('--no-sandbox')
                options.add_argument("--disable-dev-shm-usage");
                options.add_argument('--headless')
                options.add_argument('--disable-gpu')

                import purge_db


            driver = webdriver.Chrome(chromedriver_path, chrome_options=options)

            open_web_site()


            for file in categs_files:
                import_categ_file(file)


            for file in products_files:
                import_products_file(file)



            rebuild_mf_index()

            driver.quit()
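The driver above is created with the Selenium 3 call signature (webdriver.Chrome(chromedriver_path, chrome_options=options)), which Selenium 4 removed. If the script runs against Selenium 4, an equivalent construction would be roughly the following sketch (assuming the same chromedriver_path and options):

# Sketch: Selenium 4 style driver construction; the executable path moves into a Service
# object and chrome_options becomes options (assumption, not part of the original script).
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def make_chrome_driver(chromedriver_path, options):
    service = Service(executable_path=chromedriver_path)
    return webdriver.Chrome(service=service, options=options)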
Example #8
def parse_new():

    file_of_raw_catalogs = crossparser_tools.file_of_raw_catalogs

    files_to_parse = {}

    with open(file_of_raw_catalogs, 'r', newline='',
              encoding="utf8") as files_toimport:
        for line in files_toimport:
            line = line.split('$$')
            files_to_parse[line[1].strip()] = line[0].strip().replace('\n', '')

    if len(files_to_parse) == 0:
        return

    for file, site in files_to_parse.items():
        if os.path.isfile(file):
            crossparser_tools.write_to_log('Start processing file: ' + file)
            global current_site
            current_site = site
            make_csv(file)
Example #9
def parse_link(site, link):

    try:

        global driver
        global current_links
        current_links[site] = link

        url = websites[site]
        driver.get(url.strip())

        url_input = driver.find_element_by_css_selector(
            'div.inputfields input.textbox.urlinput')
        url_input.send_keys(link)

        strt_btn = driver.find_element_by_css_selector('#startBtn')

        if credentials['is_demo'] == 'yes':
            strt_btn = driver.find_element_by_css_selector('#startDemoBtn')

        driver.execute_script(
            "arguments[0].scrollIntoView();$('#header').remove();", strt_btn)
        strt_btn.click()

        WebDriverWait(driver, 33).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#progressBar')))
        crossparser_tools.write_to_log('Initiated parsing of ' + link)

    except Exception as e:
        crossparser_tools.write_to_log('Failed initiate parsing of ' + link)
        crossparser_tools.write_to_log(str(e))
Example #10
def import_products_file(file):

    global driver

    crossparser_tools.write_to_log('Start import ' + file)
    url = credentials['url_module'].strip() + 'app_product&token=' + token
    driver.get(url)

    import_tab = driver.find_element_by_id('link_tab_import')
    import_tab.click()

    file_input = driver.find_element_by_css_selector('#form_product_import > div:nth-child(1) > div:nth-child(1) > div:nth-child(19) > div > input[type="file"]')
    file_input.send_keys(file)

    submit_btn = driver.find_element_by_css_selector('#form_product_import > div:nth-child(2) > div > div > button')

    actions = ActionChains(driver)
    actions.move_to_element(submit_btn)
    actions.click(submit_btn)
    actions.perform()

    WebDriverWait(driver, 3600).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#content > div.container-fluid.csvprice_pro_container > div.alert.alert-success.alert-dismissible > i')))
    #print('done file:', file)
    succ_text = driver.find_element_by_css_selector('#content > div.container-fluid.csvprice_pro_container > div.alert.alert-success.alert-dismissible')
    crossparser_tools.write_to_log('Successfully imported ' + file)
    crossparser_tools.write_to_log(succ_text.text.replace('μ', '').replace('\n\n', '\n').replace('×', ''))
    sleep(1)
Example #11
def download_catalogs(site, is_save):
    global driver

    #Download primary prices catalog

    download_btn = WebDriverWait(driver, 3600).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#content .products-menu .export-button")))
    driver.execute_script(
        "arguments[0].scrollIntoView();$('#header').remove();", download_btn)
    download_btn.click()
    before = dict([(f, None) for f in os.listdir(temp_folder)])
    final_download_btn = WebDriverWait(driver, 100).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "#exportBtn")))
    driver.execute_script(
        "arguments[0].scrollIntoView();$('#header').remove();",
        final_download_btn)
    final_download_btn.click()
    sleep(5)
    after = dict([(f, None) for f in os.listdir(temp_folder)])
    added = [f for f in after if f not in before]

    if len(added) == 0:
        crossparser_tools.write_to_log('Failed to download file. Website: ' +
                                       site)
        current_link = current_links[site]
        crossparser_tools.write_to_log('Failed to parse link: ' + current_link)
        driver.save_screenshot(
            "/var/www/html/boots-market/crossparser/temp/screenshot" + site +
            ".png")
        return ''

    filename = temp_folder + ''.join(added)

    wait_for_download(filename)

    if is_save == True:
        global counter_links_parsed
        counter_links_parsed += 1
        crossparser_tools.write_to_log('Downloaded primary file of ' + site +
                                       '. Saved to ' + filename)
        with open(file_of_raw_catalogs, 'a', newline='',
                  encoding="utf8") as files_toimport:
            files_toimport.write(site + '$$' + filename + '\n')
    else:
        crossparser_tools.write_to_log('Downloaded secondary file of ' + site +
                                       '. Saved to ' + filename)

    return filename
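wait_for_download() is not shown in these examples. A plausible sketch (an assumption, not the original helper) polls until the browser's temporary download suffix is gone and the file size stops changing:

# Sketch of a wait_for_download() helper: Chrome writes a .crdownload file and Firefox a
# .part file while a download is in progress (hypothetical implementation).
import os
from time import sleep

def wait_for_download(filename, timeout=300):
    last_size = -1
    for _ in range(timeout):
        in_progress = (os.path.isfile(filename + '.crdownload')
                       or os.path.isfile(filename + '.part'))
        if os.path.isfile(filename) and not in_progress:
            size = os.stat(filename).st_size
            if size > 0 and size == last_size:
                return True
            last_size = size
        sleep(1)
    return False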
Example #12
def clear_db():

    global sql_query
    parse_sql_query()
    #print(sql_query)

    #delete temp files
    if os.path.isfile(data_folder + 'partner_links'):
        os.remove(data_folder + 'partner_links')

    cat_menu = data_folder + 'category_menu.txt'
    if os.path.isfile(cat_menu):
        os.remove(cat_menu)
        open(cat_menu, 'a').close()

    mydb = mysql.connector.connect(host='localhost',
                                   user=credentials['php_login'],
                                   passwd=credentials['php_password'],
                                   database=credentials['php_db'])

    mycursor = mydb.cursor()

    sql_queries = sql_query.split('\n')
    for query in sql_queries:
        mycursor.execute(query)
    #sql_query = 'TRUNCATE bootsmarketdb.`oc_product_to_category`;'
    #mycursor.execute(sql_query, multi=True)

    sql_query = 'SELECT * FROM bootsmarketdb.`oc_product_option_value`;'
    mycursor.execute(sql_query)

    myresult = mycursor.fetchall()
    #print(myresult)

    if len(myresult) == 0:
        crossparser_tools.write_to_log('db successfully purged')
    else:
        crossparser_tools.write_to_log('db purge failed')
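clear_db() executes the generated statements without an explicit commit, which is fine for TRUNCATE (it commits implicitly) but not for DELETE or UPDATE, and a blank line in sql_query would make cursor.execute() raise. A hedged variant of the purge loop (not the original code):

# Sketch: purge loop that skips blank lines and commits afterwards; commit() is a no-op
# for TRUNCATE but required for DML statements when autocommit is off (assumption).
def run_purge_queries(mydb, mycursor, sql_query):
    for query in sql_query.split('\n'):
        if query.strip():
            mycursor.execute(query)
    mydb.commit()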
Example #13
def parse_session(session_info):

    if len(session_info) == 0:
        crossparser_tools.write_to_log('Digger ' + str(digger_id) +
                                       ' does not respond properly')
        return

    session_id = session_info['id']
    print('session_id:', session_id)

    session_data = get_session_data(digger_id, session_id)
    crossparser_tools.write_to_log('Successfully retrieved session ' +
                                   str(session_id) + ' data of digger ' +
                                   str(digger_id) + '. Items in session: ' +
                                   str(len(session_data)))

    global items_counter_parsed
    items_counter_parsed += len(session_data)
    #print('session_data', len(session_data))

    #parse_categories()

    make_csv(session_data, current_site)
Example #14
def make_csv(data, current_site):

    filename = temp_folder + current_site + '-' + digger_id + '-import.csv'

    with open(filename, 'w+', newline='', encoding="utf8") as csvexportfile:

        csv_out_data = json_to_csv(data, csvexportfile)
        if csv_out_data == '':
            crossparser_tools.write_to_log('No csv_out_data fetched')
            return

        #print(csv_out_data)
        global csv_out_data_counter
        csv_out_data_counter = 0

        for row in csv_out_data:
            #for i in range(len(row)):
            #row[i] = str(row[i])

            #csvexportfile.write(';'.join(row) + '\n')
            parse_row(row, csvexportfile)

    if csv_out_data_counter == 0:
        crossparser_tools.write_to_log('Failed to make csv file of ' +
                                       current_site +
                                       '. No entries. Digger id: ' + digger_id)
        return

    with open(temp_folder + 'files_prod_import.txt',
              'a',
              newline='',
              encoding="utf8") as files_toimport:
        files_toimport.write(filename + '\n')

    crossparser_tools.write_to_log('Made csv file with ' +
                                   str(csv_out_data_counter) + ' items of ' +
                                   current_site + '. Digger id: ' + digger_id +
                                   '. Saved to file: ' + filename)
Example #15
def parse_row(row, csvwriter):

    #print("in row: ", row)
    if row == '':
        return

    global table_titles_list
    n_import = len(table_titles_list)
    n_export = len(export_fields)
    row_out = [''] * n_export

    row = row.split(';')

    #First 4 cells are usually categories
    row[0] = row[0] + '|' + row[1] + '|' + row[2] + '|' + row[3]

    #Format export row
    i = 0
    for cell in row:

        if i >= n_import:
            print("col out of range of header: ", cell)
            break

        for attr, value in export_fields.items():
            if attr == table_titles_list[i]:
                row_out[export_fields_nums[attr]] = cell.replace('"',
                                                                 '').replace(
                                                                     '\n', '')

        i = i + 1

    if print_rows_formating == True:
        print("row formated for export: ", row_out)

    #Little customize fields for special stores
    if current_site == 'tervolina.ru':
        index = export_fields_array.index('_MANUFACTURER_')
        row_out[index] = 'Tervolina'

    #Leave product if empty fields of:
    index = export_fields_array.index('_MANUFACTURER_')
    prod_brand = row_out[index].replace("'", '').replace('"', '')
    prod_categs = row[0].replace('|', '').replace("'", '').replace('"', '')
    index = export_fields_array.index('_PRICE_')
    prod_price = row_out[index].replace("'", '').replace('"', '')
    index = export_fields_array.index('_NAME_')
    prod_name = row_out[index].replace("'", '').replace('"', '')

    if prod_name == '' or prod_name == ' ' or prod_name is None:
        print('no prod name')
        return ''

    if prod_categs == '' or prod_categs == ' ' or prod_categs is None:
        print('no prod_categs for ' + prod_name)
        return ''

    if prod_brand == '' or prod_brand == ' ' or prod_brand is None:
        print('no prod_brand for ' + prod_name)
        return ''

    if prod_price == '' or prod_price == ' ' or prod_price is None:
        print('no prod_price for ' + prod_name)
        return ''

    # Customize special fields (such as Size, etc)
    # <Optional loop>
    # From 'cell' to 'row_out[i]'
    i = -1
    #print('row_out', row_out)

    for cell in row_out:
        i += 1
        current_row_title = export_fields_array[i]

        if current_row_title == '_DESCRIPTION_' or current_row_title == '_NAME_' or current_row_title == '_MANUFACTURER_':
            row_out[i] = '"' + cell.strip().replace("'", '') + '"'

        #Format Size:
        if current_row_title == '_OPTIONS_':
            new_size_cell = '"'
            cell = cell.replace('\n', '').replace('\r', '')
            sizes_arr = cell.split('|')
            for size in sizes_arr:
                new_size_cell += 'select|Размер|'
                new_size_cell += size
                new_size_cell += '|1|1000|1|+|0.0000|+|0|+|0.00\n'

            row_out[i] = new_size_cell + '"'

        #Format Price:
        if current_row_title == '_PRICE_':
            row_out[i] = crossparser_tools.get_only_nums(cell)

        if current_row_title == '_SPECIAL_':
            special_price = crossparser_tools.get_only_nums(cell)
            if special_price != '':
                price_index = export_fields_array.index('_PRICE_')
                curr_price = row_out[price_index]
                #Compare prices numerically, not as strings
                if curr_price != '' and int(special_price) > int(curr_price):
                    row_out[price_index] = special_price
                    special_price = crossparser_tools.get_only_nums(curr_price)
                    row_out[i] = '1,0,' + str(
                        special_price) + '.00,0000-00-00,0000-00-00'

        #Form unique SKU
        if current_row_title == '_SKU_':
            #Save SKU to _UPC_
            upc_index = export_fields_array.index('_UPC_')
            row_out[upc_index] = cell
            #Change sku
            url_index = export_fields_array.index('_LOCATION_')
            url = row_out[url_index]
            #id_from_url = crossparser_tools.get_uniqid_from_url(url, current_site)
            id_from_url = crossparser_tools.get_rand_uniqid(10)

            if str(cell) not in id_from_url:
                id_from_url += str(cell)
            row_out[i] = id_from_url
            global global_prod_id
            global_prod_id = id_from_url

        #Copy SKU to Model:
        if current_row_title == '_MODEL_':
            sku_index = export_fields_array.index('_SKU_')
            curr_sku = row_out[sku_index]
            row_out[i] = curr_sku

        #SEO URL:
        if current_row_title == '_SEO_KEYWORD_':
            index = export_fields_array.index('_SKU_')
            prod_sku = row_out[index]
            index = export_fields_array.index('_NAME_')
            prod_name = row_out[index]

            seo_url = crossparser_tools.to_seo_url(prod_name)

            row_out[i] = seo_url + '-' + prod_sku.lower()

        #Parse and download all images:
        if current_row_title == '_IMAGES_':
            #Decline product without imgs:
            if cell == '':
                return

            imgs = cell.split('|')
            checked_imgs = []
            for img in imgs:
                check = image_check(img)
                if check != '':
                    checked_imgs.append(check)

            if len(checked_imgs) == 0:
                row_out[i] = ''
                #Decline product without imgs:
                return
            else:
                row_out[i] = (',').join(checked_imgs)

        #Set primary (first) image:
        if current_row_title == '_IMAGE_':
            imgs_index = export_fields_array.index('_IMAGES_')
            imgs = row_out[imgs_index]
            if imgs == '':
                loc_index = export_fields_array.index('_LOCATION_')
                prod_url = row_out[loc_index]
                crossparser_tools.write_to_log(
                    'No images collected for product: ' + prod_url +
                    ' (failed to download)')
                #Decline product without imgs:
                return

            imgs = imgs.split(',')
            row_out[i] = imgs[0]
            imgs.pop(0)
            row_out[imgs_index] = ','.join(imgs)

        if current_row_title == '_QUANTITY_':
            row_out[i] = str(99999)

        #Create category:
        if current_row_title == '_CATEGORY_ID_':
            #print(cell)
            index = export_fields_array.index('_MANUFACTURER_')
            prod_brand = row_out[index]
            index = export_fields_array.index('_NAME_')
            prod_name = row_out[index]
            prod_categs = cell
            index = export_fields_array.index('_PRICE_')
            prod_price = 999999
            if row_out[index] != '':
                prod_price = int(row_out[index])

            index = export_fields_array.index('_IMAGE_')
            prod_image = row_out[index]

            categs_ids = create_category(prod_categs, prod_brand, prod_name,
                                         prod_price, prod_image)
            if categs_ids is None:
                categs_ids = ''

            if categs_ids == '':
                #Decline product without categories:
                url_index = export_fields_array.index('_LOCATION_')
                url = row_out[url_index]
                crossparser_tools.write_to_log(
                    'No categories created for product: ' + url)
                return

            row_out[i] = ','.join(categs_ids)

        #Set up attributes: Brand, Gender, Season:
        if current_row_title == '_ATTRIBUTES_':
            global global_prod_gender
            row = '"Обувь|Пол|' + global_prod_gender.replace('"', '') + '\n'

            index = export_fields_array.index('_MANUFACTURER_')
            prod_brand = row_out[index].replace('"', '')
            row += 'Обувь|Бренд|' + prod_brand + '\n'

            prod_season = 'Лето'
            row += 'Обувь|Сезон|' + prod_season + '"'

            row_out[i] = row

        #Save this store link
        if current_row_title == '_EAN_':
            row_out[i] = current_site

        if current_row_title == '_LOCATION_':
            link = cell.replace(current_site,
                                '').replace('https',
                                            '').replace('http',
                                                        '').replace('www', '')
            link_id = crossparser_tools.get_only_letters(link)
            with open(data_folder + 'partner_links',
                      'a+') as partner_links_file:

                link_no_http = cell.replace('https://', '').replace(
                    'http://', '').replace('www.', '')
                if current_site not in partner_links:
                    crossparser_tools.write_to_log(
                        'No partner link for site: ' + current_site)
                    return

                part_link = partner_links[current_site]
                deeplink = part_link + '?ulp=http%3A%2F%2F' + link_no_http

                link_row = link_id + '$$' + deeplink + '\n'
                partner_links_file.write(link_row)

            row_out[i] = link_id

    # </>

    #print("out row: ", ';'.join(row_out) + '\n')

    for i in range(len(row_out)):
        row_out[i] = str(row_out[i])

    csvwriter.write(';'.join(row_out) + '\n')

    global items_counter_converted
    items_counter_converted += 1
    global csv_out_data_counter
    csv_out_data_counter += 1
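parse_row() quotes and joins the export fields by hand before writing a ';'-separated line. The standard csv module could do the quoting and escaping instead; a minimal sketch of that approach (an assumption, not how the script currently writes its files):

# Sketch: write an assembled row with csv.writer so embedded quotes, semicolons and
# newlines are escaped automatically (hypothetical helper, not used by the script).
import csv

def write_export_row(csvexportfile, row_out):
    writer = csv.writer(csvexportfile, delimiter=';', quoting=csv.QUOTE_MINIMAL)
    writer.writerow([str(cell) for cell in row_out])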
Example #16
def import_files():

    global driver

    if os.path.isfile(files_prod_import):
        if os.path.isfile(files_categ_import):

            parse_websites()

            if len(categs_files) == 0 or len(products_files) == 0:
                crossparser_tools.write_to_log('Nothing to import')
                return

            options = webdriver.ChromeOptions()

            if credentials['is_server'] == 'no':
                chromedriver_path = config_folder + 'chromedriver.exe'
                options.add_argument('--window-size=1200,700')

            if credentials['is_server'] == 'yes':
                chromedriver_path = config_folder + 'chromedriver'
                options.add_argument('--no-sandbox')
                options.add_argument("--disable-dev-shm-usage")
                options.add_argument('--headless')
                options.add_argument('--disable-gpu')

                import purge_db

            driver = webdriver.Chrome(chromedriver_path,
                                      chrome_options=options)

            open_web_site()

            for file in categs_files:
                import_categ_file(file)

            for file in products_files:
                import_products_file(file)

            rebuild_mf_index()

            driver.quit()


crossparser_tools.write_to_log(' ******* Start import files *******')
import_files()
Example #17
def start_checking():

    is_done = True
    global tabs_delay
    global websites_parsed

    for site, handle in window_handles.items():
        try:
            if websites_parsed[site] == True:
                continue

            driver.switch_to.window(handle)

            parse_time = datetime.datetime.now() - tabs_delay[site]

            #Check if parsing hasn't started
            if '/start/' in driver.current_url:
                if parse_time > datetime.timedelta(minutes=1.0):
                    current_link = current_links[site]
                    crossparser_tools.write_to_log(
                        'Parsing has not started. Failed to parse link: ' +
                        current_link)

                    #Start parse new link:
                    is_done = check_parse_new_link(site, is_done)

            #Check if parsing of the link has taken too long
            if parse_time > max_parse_time:
                print('parsing of website ' + site + ' took more than an hour')
                if check_exists_by_css_selector('#cancelBtn'):
                    btn = WebDriverWait(driver, 30).until(
                        EC.element_to_be_clickable(
                            (By.CSS_SELECTOR, "#cancelBtn")))
                    driver.execute_script(
                        "arguments[0].scrollIntoView();$('#header').remove();",
                        btn)
                    btn.click()
                    sleep(2)
                #Start parse new link:
                is_done = check_parse_new_link(site, is_done)
            else:
                pass
                #print('still parsing ' + str(parse_time))

            if check_exists_by_css_selector(
                    '#content .products-menu .export-button') == False:
                #Tab still parsing, skip
                is_done = False
            else:

                #Parsing complete. Download catalog
                filename1 = download_catalogs(site, True)

                #Download secondary prices catalog
                if filename1 != '':
                    btn = WebDriverWait(driver, 30).until(
                        EC.element_to_be_clickable((
                            By.CSS_SELECTOR,
                            "div.ui-dialog.ui-widget button.ui-dialog-titlebar-close"
                        )))
                    driver.execute_script(
                        "arguments[0].scrollIntoView();$('#header').remove();",
                        btn)
                    btn.click()
                    btn = WebDriverWait(driver, 30).until(
                        EC.element_to_be_clickable(
                            (By.CSS_SELECTOR, "#changePrice")))
                    driver.execute_script(
                        "arguments[0].scrollIntoView();$('#header').remove();",
                        btn)
                    btn.click()
                    btn = WebDriverWait(driver, 30).until(
                        EC.element_to_be_clickable(
                            (By.CSS_SELECTOR, "#price > option:nth-child(2)")))
                    driver.execute_script(
                        "arguments[0].scrollIntoView();$('#header').remove();",
                        btn)
                    btn.click()
                    btn = WebDriverWait(driver, 30).until(
                        EC.element_to_be_clickable(
                            (By.CSS_SELECTOR, "#applyChangePriceBtn")))
                    driver.execute_script(
                        "arguments[0].scrollIntoView();$('#header').remove();",
                        btn)
                    btn.click()
                    sleep(2)
                    filename2 = download_catalogs(site, False)
                    #unite prices:
                    if filename2 != '':
                        unite_prices(filename1, filename2)

                #Start parse new link:
                is_done = check_parse_new_link(site, is_done)

        except Exception as e:
            crossparser_tools.write_to_log(
                'failed to check or initiate link of site: ' + site)
            current_link = current_links[site]
            crossparser_tools.write_to_log('Failed to parse link: ' +
                                           current_link)
            crossparser_tools.write_to_log(str(e))

    sleep(5)
    if is_done == False:
        start_checking()
Example #18
def json_to_csv(data, csvexportfile):

    csv_out_data = []
    csv_header = []

    if len(data) == 1:
        crossparser_tools.write_to_log(data['detail'])
        return ''

    for product in data:
        product = product['post']
        #print('product: ', product['prod_name'])

        row_out = [''] * max([len(product), len(csv_header)])

        for attr, value in product.items():
            if attr not in csv_header:
                csv_header.append(attr)

            if attr in csv_header:
                #Optional PRE-processing of input fields

                if attr == 'products_imgs' or attr == 'prod_categories':
                    value = ','.join(value)

                if attr == 'all_native_sizes':
                    # Clear disabled sized from all natives:
                    if 'disabled_native_sizes' in product:
                        disabled_native_sizes = product[
                            'disabled_native_sizes']
                        if disabled_native_sizes != '':
                            for size in disabled_native_sizes:
                                if size in value:
                                    value.pop(value.index(size))

                    value = '|'.join(value)

                if current_site == 'sportmaster.ru':
                    if attr == 'products_imgs':
                        value = value.split("'")
                        imgs = []
                        for chunk in value:
                            if 'https://cdn.sptmr.ru' in chunk:
                                chunk = chunk.replace(
                                    'resize_cache/',
                                    '').replace('/${width}_${height}_1', '')
                                imgs.append(chunk)

                        value = (',').join(imgs)
                        #print(value)

                if attr == 'products_imgs':
                    imgs = value.split(',')
                    checked_imgs = []
                    for img in imgs:
                        check = image_check(img)
                        if check != '':
                            checked_imgs.append(check)

                    if len(checked_imgs) == 0:
                        value = ''
                    else:
                        value = (',').join(checked_imgs)

                row_out[csv_header.index(attr)] = value

        #print('csv_header: ', csv_header)
        #print('row_out: ', row_out)
        csv_out_data.append(row_out)

    parse_header(csv_header, csvexportfile)
    return csv_out_data
Example #19
def parse_row(row, csvwriter):

    global csv_out_data_counter

    n_import = len(table_titles_list)
    n_export = len(export_fields)
    i = 0
    row_out = [''] * n_export

    #Format export row

    for cell in row:

        if i >= n_import:
            print("col out of range of header: ", cell)
            break

        for attr, value in export_fields.items():
            if attr == table_titles_list[i]:
                row_out[export_fields_nums[attr]] = cell.replace('"',
                                                                 '').replace(
                                                                     '\n', '')

        i = i + 1

    if print_rows_formating == True:
        print("row formated for export: ", row_out)

    # Customize special fields (such as Size, etc)
    # <Optional loop>
    i = -1
    for cell in row_out:
        i += 1
        current_row_title = export_fields_array[i]

        if current_row_title == '_DESCRIPTION_' or current_row_title == '_NAME_' or current_row_title == '_MANUFACTURER_':
            row_out[i] = '"' + cell.strip() + '"'

        #Format Size:
        if current_row_title == '_OPTIONS_':
            new_size_cell = '"'
            cell = cell.replace('\n', '').replace('\r', '')
            sizes_arr = cell.split('|')
            for size in sizes_arr:
                new_size_cell += 'select|Размер|'
                new_size_cell += size
                new_size_cell += '|1|1000|1|+|0.0000|+|0|+|0.00\n'

            row_out[i] = new_size_cell + '"'

        #Format Price:
        if current_row_title == '_PRICE_':
            row_out[i] = ''.join(re.findall(r'\d+', cell.replace(' ', '')))

        if current_row_title == '_SPECIAL_':
            special_price = ''.join(re.findall(r'\d+', cell.replace(' ', '')))
            if special_price != '':
                price_index = export_fields_array.index('_PRICE_')
                curr_price = row_out[price_index]
                row_out[price_index] = special_price
                special_price = '.'.join(
                    re.findall(
                        r'\d+',
                        curr_price.replace(' ',
                                           '').replace('\n',
                                                       '').replace('\r', '')))
                row_out[i] = '1,0,' + str(
                    special_price) + '.00,0000-00-00,0000-00-00'

        #Copy SKU to Model:
        if current_row_title == '_MODEL_':
            sku_index = export_fields_array.index('_SKU_')
            curr_sku = row_out[sku_index]
            row_out[i] = curr_sku

        #Set primary (first) image:
        if current_row_title == '_IMAGE_':
            imgs_index = export_fields_array.index('_IMAGES_')
            imgs = row_out[imgs_index]
            if imgs == '':
                loc_index = export_fields_array.index('_LOCATION_')
                prod_url = row_out[loc_index]
                crossparser_tools.write_to_log(
                    'No images collected for product: ' + prod_url)
                #Decline product without imgs:
                return

            imgs = imgs.split(',')
            row_out[i] = imgs[0]
            imgs.pop(0)
            row_out[imgs_index] = ','.join(imgs)

        if current_row_title == '_QUANTITY_':
            row_out[i] = str(99999)

        #Create category:
        if current_row_title == '_CATEGORY_ID_':
            index = export_fields_array.index('_MANUFACTURER_')
            prod_brand = row_out[index]
            index = export_fields_array.index('_NAME_')
            prod_name = row_out[index]
            prod_categs = cell
            index = export_fields_array.index('_PRICE_')
            prod_price = 999999
            if row_out[index] != '':
                prod_price = int(row_out[index])

            index = export_fields_array.index('_IMAGE_')
            prod_image = row_out[index]

            categs_ids = create_category(prod_categs, prod_brand, prod_name,
                                         prod_price, prod_image)
            if categs_ids is None:
                categs_ids = ''
            row_out[i] = ','.join(categs_ids)

        #SEO URL:
        if current_row_title == '_SEO_KEYWORD_':
            index = export_fields_array.index('_SKU_')
            prod_sku = row_out[index]
            index = export_fields_array.index('_NAME_')
            prod_name = row_out[index]

            seo_url = to_seo_url(prod_name)

            row_out[i] = seo_url + '-' + prod_sku.lower()

        #Set up attributes: Brand, Gender, Season:
        if current_row_title == '_ATTRIBUTES_':
            global global_prod_gender
            row = '"Обувь|Пол|' + global_prod_gender.replace('"', '') + '\n'

            index = export_fields_array.index('_MANUFACTURER_')
            prod_brand = row_out[index].replace('"', '')
            row += 'Обувь|Бренд|' + prod_brand + '\n'

            prod_season = 'Лето'
            row += 'Обувь|Сезон|' + prod_season + '"'

            row_out[i] = row

        if current_row_title == '_LOCATION_':
            link_id = ''.join(re.findall(r'\w+',
                                         cell)).replace('http',
                                                        '').replace('www', '')
            print(link_id)

    # </>
    if print_rows_formating == True:
        print("out row: ", ';'.join(row_out) + '\n')
    for i in range(len(row_out)):
        row_out[i] = str(row_out[i])
    csvwriter.write(';'.join(row_out) + '\n')
    global items_counter_converted
    items_counter_converted += 1
    csv_out_data_counter += 1
Example #20
def make_categories_csv():
    global categories_names
    global categories_ids
    global categories_max_id
    global categories_parent_ids

    if len(categories_names) == 0:
        crossparser_tools.write_to_log(
            'Failed to make categories csv. No entries')
        return

    res_eng = regex.search(r'\p{IsCyrillic}', 'lala')

    categs_for_menu = []
    categs_for_menu_urls = []
    categs_filename = temp_folder + 'category_export.csv'

    with open(temp_folder + 'files_categ_import.txt',
              'w+',
              newline='',
              encoding="utf8") as files_toimport:
        files_toimport.write(categs_filename)

    with open(categs_filename, 'w+', newline='', encoding="utf8") as cat_file:
        row_out = '_ID_;_PARENT_ID_;_NAME_;_META_H1_;_META_TITLE_;_META_KEYWORDS_;_META_DESCRIPTION_;_DESCRIPTION_;_IMAGE_;_SEO_KEYWORD_;_SORT_ORDER_\n'
        cat_file.write(row_out)

        #print('categories_qnt: ', categories_qnt)

        for i_cat in range(len(categories_names)):

            #Form up H1 header:
            categs = categories_names[i_cat].split('|')

            main_subcat = ''
            if len(categs) == 2:
                main_subcat = ';'.join(categs)

            for i in range(len(categs)):
                res = regex.search(r'\p{IsCyrillic}', categs[i])
                if res != res_eng:
                    categs[i] = categs[i].lower()

            for i in range(len(categs)):
                for j in range(i + 1, len(categs)):
                    if i > (len(categs) - 1) or j > (len(categs) - 1):
                        break
                    if categs[i] in categs[j]:
                        categs.pop(i)

            seo_title = ''
            is_brand_cat = False

            if categs[0] == 'бренды':
                categs.pop(0)
                is_brand_cat = True
                #Form up SEO Title for brands:
                if len(categs) > 0:
                    seo_title = 'Купить ' + ' '.join(
                        categs) + '. Каталог ' + categs[0] + ' оригинал, цены'

            if len(categs) > 0:
                #Form up SEO Title for gender categs:
                categs[0] = categs[0].capitalize()
                if is_brand_cat == False:
                    seo_title = ' '.join(categs) + ' купить по цене от ' + str(
                        categories_lower_price[
                            categories_ids[i_cat]]) + ' руб. Фото, каталог'

            h1header = ' '.join(categs)

            #SEO URL:
            seo_url = to_seo_url(h1header)

            #Categs for menu:
            if main_subcat != '':
                if main_subcat not in categs_for_menu:
                    num = str(categories_qnt[categories_ids[i_cat]])
                    categs_for_menu.append(
                        [main_subcat.capitalize(), seo_url, num])

            row_out = [''] * 11
            row_out[0] = str(categories_ids[i_cat])
            row_out[1] = str(categories_parent_ids[i_cat])
            row_out[2] = categories_names[i_cat]
            #print('new category:', categories_names[i_cat])

            row_out[3] = h1header
            row_out[4] = seo_title

            row_out[8] = str(categories_images[categories_ids[i_cat]])
            row_out[9] = seo_url
            row_out[10] = str(categories_qnt[categories_ids[i_cat]])
            #print('adding new category:', row_out)

            cat_file.write(';'.join(row_out) + '\n')

    crossparser_tools.write_to_log('Made csv of categories with ' +
                                   str(len(categories_names)) +
                                   ' items. Saved to ' + categs_filename)
    categs_for_menu = sorted(categs_for_menu,
                             key=lambda x: x[0],
                             reverse=False)
    #print(categs_for_menu)

    #make menu file for web-site
    with open(data_folder + 'category_menu.txt',
              'w+',
              newline='',
              encoding="utf8") as cat_file:
        for cat in categs_for_menu:
            cat_file.write(';'.join(cat) + '$$')
Example #21
def parsnew():

    #Clear import catalog files (files of files)
    with open(file_of_raw_catalogs, 'w+', newline='',
              encoding="utf8") as files_toimport:
        files_toimport.close()

    global driver

    parse_websites()

    global websites_parsed

    for attr, value in websites.items():
        websites_parsed[attr] = False

    if credentials['is_server'] == 'no':

        options = webdriver.ChromeOptions()
        #prefs = {"download.default_directory" : temp_folder, "download.prompt_for_download": False, "download.directory_upgrade": True, "safebrowsing.enabled": True }
        temp_folder_downl = 'C:\\Work\\Crossparser\\temp'
        print(temp_folder_downl)
        prefs = {
            "download.default_directory": temp_folder_downl,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True
        }
        options.add_experimental_option("prefs", prefs)
        chromedriver_path = config_folder + 'chromedriver.exe'
        options.add_argument('--window-size=1200,700')

        driver = webdriver.Chrome(chromedriver_path, chrome_options=options)

    #Unfortunately Chrome does not save downloads into the specified folder here, so Firefox is used for this step only
    if credentials['is_server'] == 'yes':
        #chromedriver_path = config_folder + 'chromedriver'
        #options.add_argument('--no-sandbox')
        #options.add_argument("--disable-dev-shm-usage");
        #options.add_argument('--headless')
        #options.add_argument('--disable-gpu')

        profile = webdriver.FirefoxProfile()
        profile.set_preference('browser.download.folderList',
                               2)  # custom location
        profile.set_preference('browser.download.manager.showWhenStarting',
                               False)
        profile.set_preference('browser.download.dir', temp_folder)
        profile.set_preference('browser.helperApps.neverAsk.saveToDisk',
                               'text/csv')

        from selenium.webdriver.firefox.options import Options
        firefox_options = Options()
        firefox_options.add_argument('-headless')
        driver = webdriver.Firefox(firefox_profile=profile,
                                   options=firefox_options)

    cloud_login()
    start_parse_all()
    start_checking()

    crossparser_tools.write_to_log('Done with cloudparser')
    crossparser_tools.write_to_log('Totally links found: ' +
                                   str(counter_links_total))
    crossparser_tools.write_to_log('Successfully parsed: ' +
                                   str(counter_links_parsed))

    driver.quit()
Example #22
if not os.path.exists(website_root + img_folder):
    os.makedirs(website_root + img_folder)

#Clear import catalog files (files of files)
with open(temp_folder + 'files_prod_import.txt',
          'w+',
          newline='',
          encoding="utf8") as files_toimport:
    files_toimport.close()
with open(temp_folder + 'files_categ_import.txt',
          'w+',
          newline='',
          encoding="utf8") as files_toimport:
    files_toimport.close()

crossparser_tools.write_to_log(
    '\n\n ********** Script started *********** \n\n')

#start_all_diggers()
#start_digger(digger_id)
#sleep(10)

#parse_new()

checking_all_diggers()

make_categories_csv()

crossparser_tools.write_to_log('parsing process completed')
crossparser_tools.write_to_log('totally parsed: ' + str(items_counter_parsed) +
                               ' items')
crossparser_tools.write_to_log('successfully converted: ' +
                               str(items_counter_converted) + ' items')
Example #23
def image_check(img):
    global img_counter_existed
    global img_counter_dowloaded
    global img_counter_failed_to_dowload
    global img_counter_dowloaded_size
    global img_counter_dowloaded_size_compressed

    if img in img_db.keys():
        img_counter_existed += 1
        return img_db[img]

    try:
        #print('downloading img:', img)

        #img_name = crossparser_tools.get_uniqid_from_url(img, current_site)
        img_name = crossparser_tools.get_rand_uniqid(15)
        img_name = img_name + '.jpg'

        #file_path = website_root + img_folder + img_name
        file_path = img_folder + img_name

        print('downloading img: ' + img)
        print('save to: ' + file_path)

        urllib.request.urlretrieve(img, file_path)

        size = os.stat(file_path).st_size
        img_counter_dowloaded_size += size
        #print('size in: ' , size)

        compression = 100

        if size > 2000000:
            compression = 10
        elif size > 1000000:
            compression = 20
        elif size > 500000:
            compression = 50
        elif size > 200000:
            compression = 70

        if size > 200000:
            image = Image.open(file_path)
            image.save(file_path, quality=compression)
        else:
            compression = 0

        size = os.stat(file_path).st_size
        img_counter_dowloaded_size_compressed += size

        #print('compression: ' , compression)
        #print('size out: ' , size)

        img_db[img] = img_module_folder + img_name
        global global_prod_id
        img_db_products[img] = global_prod_id

        #Write to DB
        with open(data_folder + 'img_db', 'a+', newline='',
                  encoding="utf8") as img_dbfile:
            img_dbfile.write(img + '$$' + img_module_folder + img_name + '\n')

        with open(data_folder + 'img_db_prods',
                  'a+',
                  newline='',
                  encoding="utf8") as img_dbfile:
            img_dbfile.write(img_name + '$$' + global_prod_id + '\n')

        #Add to ElasticSearch
        if credentials['is_server'] == 'yes':
            try:
                image_match_add.add_img(file_path, global_prod_id)
            except Exception as e:
                crossparser_tools.write_to_log('unable to add img to ElasticSearch: ' + img)
                crossparser_tools.write_to_log(e)

        img_counter_dowloaded += 1

        return img_module_folder + img_name

    except Exception as e:
        print('unable to download img:', img)
        print(e)
        img_counter_failed_to_dowload += 1
        return ''
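image_check() stores every download under a .jpg name and recompresses it with Image.save(file_path, quality=...). If the source image is a PNG or WebP with an alpha channel, writing it back as JPEG needs an RGB conversion first; a hedged sketch of that normalization (an assumption, not part of the script):

# Sketch: convert non-RGB images (e.g. PNG with alpha) before re-saving them as JPEG,
# matching the .jpg extension used by image_check() (hypothetical helper).
from PIL import Image

def recompress_as_jpeg(file_path, quality):
    image = Image.open(file_path)
    if image.mode != 'RGB':
        image = image.convert('RGB')
    image.save(file_path, 'JPEG', quality=quality)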