Esempi in Python per regex

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: service.ulitity

Metodo/funzione: regex

Esempi su hotexamples.com: 8

regex in Python: 8 esempi trovati. Questi sono i migliori esempi reali in Python per service.ulitity.regex, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Esempio n. 1

Mostra file

File: laowa.py Progetto: YSRKEN/MFT-DB-Tool

def get_laowa_lens_list(scraping: IScrapingService) -> DataFrame:
    """Scrape the LAOWA catalogue on laowa.jp and return it as a DataFrame.

    The catalogue page is read first to collect lens names and URLs, then the
    spec table (``div.productTable``) of every lens page is read into a raw
    record.  The raw Japanese columns are converted into the columns
    ``maker``, ``name``, ``url``, ``mount``, ``wide_focal_length``,
    ``telephoto_focal_length``, ``wide_f_number``, ``telephoto_f_number``,
    ``wide_min_focus_distance``, ``telephoto_min_focus_distance``,
    ``max_photographing_magnification``, ``filter_diameter``,
    ``is_drip_proof``, ``has_image_stabilization``, ``is_inner_zoom``,
    ``overall_diameter``, ``overall_length``, ``weight`` and ``price``.

    Only lenses whose mount text mentions Micro Four Thirds or Leica L are
    kept.  For Micro Four Thirds the focal lengths and the magnification are
    multiplied by 2.  Values that cannot be parsed fall back to ``0``
    (distances, magnification) or ``-1`` (filter diameter, weight).

    :param scraping: service used to download (and possibly cache) pages.
    :return: one row per supported lens.
    """
    # Collect the (name, URL) pair of every lens on the catalogue page.
    lens_list: List[Tuple[str, str]] = []
    page = scraping.get_page('https://www.laowa.jp/cat1/', cache=False)
    for div_element in page.find_all('div.product3'):
        h3_element = div_element.find('h3')
        a_element = div_element.find('a')
        if h3_element is None or a_element is None:
            continue
        lens_name = h3_element.text
        lens_url = a_element.attrs['href']
        if 'LAOWA' in lens_name and 'mm' in lens_name:
            lens_list.append((lens_name, lens_url))

    # Read the spec table of every lens page into a raw record.
    lens_raw_data_list: List[Dict[str, str]] = []
    for lens_name, lens_url in lens_list:
        page = scraping.get_page(lens_url)
        spec: Dict[str, str] = {'レンズ名': lens_name, 'URL': lens_url}
        section_element = page.find('div.productTable')
        if section_element is not None:
            for tr_element in section_element.find_all('tr'):
                td_elements = tr_element.find_all('td')
                if len(td_elements) < 2:
                    continue
                key = td_elements[0].full_text
                value = td_elements[1].full_text
                if key is None or value is None:
                    continue
                spec[key] = value
            # Special case: on this page the mount and weight cells are
            # swapped, so swap them back.
            if spec['レンズ名'] == 'LAOWA 15mm F4 WIDE ANGLE MACRO' and 'Nikon' in spec['質量']:
                spec['マウント'], spec['質量'] = spec['質量'], spec['マウント']
        lens_raw_data_list.append(spec)
    df = DataFrame.from_records(lens_raw_data_list)

    # Reshape for conversion.
    df['maker'] = 'LAOWA'
    df = convert_columns(df, {
        'レンズ名': 'name', 'URL': 'url', 'フォーマット': '対応フォーマット', '対応マウント': 'マウント',
        '寸法（鏡筒直径×長さ）': 'サイズ', '最小フォーカシングディスタンス': '最短撮影距離',
        '最大倍率比': '最大撮影倍率', '最大倍率': '最大撮影倍率',
        }, [
        '開放F値', '画角', 'レンズ構成', 'シフト機能', '最大イメージサークル', '絞り羽根枚数', 'フォーカス', 'JAN',
        '発売日', '絞り羽枚数', 'フォーカシング', 'フィルタースレッド', 'ワーキングディスタンス', '最大口径比',
        '絞り羽根枚数（F）', '絞り羽根枚数（T）', 'シフト量', '最小ワーキングディスタンス', '対応フォーマット',
    ])
    for column in ('対応フォーマット', None, ''):
        if column in df:
            del df[column]

    # Normalise the mount; rows with an unsupported or missing mount get ''
    # and are dropped.  A lens page without a spec table leaves NaN here, so
    # only real strings are searched.
    mount_list: List[str] = []
    for mount_temp in df['マウント']:
        if not isinstance(mount_temp, str):
            mount_list.append('')
        elif 'マイクロフォーサーズ' in mount_temp:
            mount_list.append('マイクロフォーサーズ')
        elif 'Leica L' in mount_temp:
            mount_list.append('ライカL')
        else:
            mount_list.append('')
    df['mount'] = mount_list
    # .copy() so that the column edits below act on an independent frame
    # instead of a slice of the unfiltered one.
    df = df[df['mount'] != ''].copy()
    del df['マウント']

    # Focal length (multiplied by 2 for Micro Four Thirds).
    w_list, t_list = extract_numbers(df['焦点距離'], [r'(\d+\.?\d*)-(\d+\.?\d*)mm'], [r'(\d+\.?\d*)mm'])
    w_list2: List[int] = []
    t_list2: List[int] = []
    for w, t, mount in zip(w_list, t_list, list(df['mount'])):
        if mount == 'マイクロフォーサーズ':
            w_list2.append(int((Decimal(w) * 2).quantize(Decimal('1'))))
            t_list2.append(int((Decimal(t) * 2).quantize(Decimal('1'))))
        elif mount == 'ライカL':
            w_list2.append(int(w))
            t_list2.append(int(t))
    df['wide_focal_length'] = w_list2
    df['telephoto_focal_length'] = t_list2
    del df['焦点距離']

    # F-number, taken from the lens name.
    w, t = extract_numbers(df['name'], [r'F(\d+\.?\d*)-(\d+\.?\d*)'], [r'F(\d+\.?\d*)'])
    df['wide_f_number'] = [float(x) for x in w]
    df['telephoto_f_number'] = [float(x) for x in t]

    # Minimum focus distance in millimetres; 0 when nothing can be parsed.
    # Single-value patterns are tried in this order after the range pattern.
    single_distance_patterns = [
        (r'(\d+.?\d*)cm', lambda s: int(Decimal(s) * 10)),
        (r'(\d+.?\d*)cｍ', lambda s: int(Decimal(s) * 10)),
        (r'(\d+.?\d*)mm', int),
    ]
    w_fd_list: List[int] = []
    t_fd_list: List[int] = []
    for fd in df['最短撮影距離'].values:
        wide_fd, tele_fd = 0, 0
        # A missing cell is NaN (a float) and cannot be searched.
        if isinstance(fd, str):
            result = regex(fd, r'(\d+.?\d*)mm～(\d+.?\d*)mm')
            if len(result) > 0:
                wide_fd, tele_fd = int(result[0]), int(result[1])
            else:
                for pattern, to_mm in single_distance_patterns:
                    result = regex(fd, pattern)
                    if len(result) > 0:
                        wide_fd = to_mm(result[0])
                        tele_fd = to_mm(result[0])
                        break
        w_fd_list.append(wide_fd)
        t_fd_list.append(tele_fd)
    df['wide_min_focus_distance'] = w_fd_list
    df['telephoto_min_focus_distance'] = t_fd_list
    del df['最短撮影距離']

    # Maximum magnification (multiplied by 2 for Micro Four Thirds).
    # Each entry is (pattern, True when the match is numerator/denominator).
    magnification_patterns = [
        (r'(\d+.?\d*)/(\d+.?\d*)倍', True),
        (r'(\d+.?\d*):(\d+.?\d*)', True),
        (r'(\d+.?\d*)倍', False),
        (r'(\d+.?\d*)', False),
    ]
    mag_list: List[float] = []
    for val1, val2 in zip(df['最大撮影倍率'].values, df['mount'].values):
        value = Decimal(0)
        if val1 == val1:  # skip NaN
            for pattern, is_ratio in magnification_patterns:
                result = regex(val1, pattern)
                if len(result) > 0:
                    if is_ratio:
                        value = Decimal(result[0]) / Decimal(result[1])
                    else:
                        value = Decimal(result[0])
                    break
        if val2 == 'マイクロフォーサーズ':
            mag_list.append(float(value * 2))
        else:
            mag_list.append(float(value))
    df['max_photographing_magnification'] = mag_list
    del df['最大撮影倍率']

    # Filter diameter; -1 when missing, unparsable, or for the one lens that
    # is excluded by name.
    fd_list: List[float] = []
    for fd, name in zip(df['フィルター径'].values, df['name'].values):
        if fd != fd or name == 'LAOWA 10-18mm F4.5-5.6 FE ZOOM':
            fd_list.append(-1)
            continue
        result = regex(fd, r'(\d+.?\d*)mm')
        if len(result) == 0:
            fd_list.append(-1)
            continue
        fd_list.append(int(result[0]))
    df['filter_diameter'] = fd_list
    del df['フィルター径']

    df['is_drip_proof'] = False
    df['has_image_stabilization'] = False

    # Flagged True when both ends of the focal range are equal.
    df['is_inner_zoom'] = [
        bool(wide == tele)
        for wide, tele in zip(df['wide_focal_length'], df['telephoto_focal_length'])
    ]

    # Overall diameter and length.
    d, le = extract_numbers(df['サイズ'], [r'(\d+\.?\d*)[^\d]+(\d+\.?\d*)(mm|ｍｍ)'], [])
    df['overall_diameter'] = [float(x) for x in d]
    df['overall_length'] = [float(x) for x in le]
    del df['サイズ']

    # Weight in grams; -1 when missing or unparsable.
    weight: List[float] = []
    for f in df['質量']:
        result = regex(f, r'([\d,]+)(g|ｇ)') if isinstance(f, str) else []
        if len(result) > 0:
            weight.append(int(result[0].replace(',', '')))
        else:
            weight.append(-1)
    df['weight'] = weight
    del df['質量']

    df['price'] = 0
    return df

Esempio n. 2

Mostra file

def get_sigma_lens_list(scraping: IScrapingService) -> DataFrame:
    """Scrape the SIGMA lens pages on sigma-global.com into a DataFrame.

    Current lenses are read from the lens search page and split by the
    ``data-lens-mount`` attribute into Micro Four Thirds and L-mount lists;
    discontinued lenses are read from the "discontinued" page and only those
    whose name contains ``DN`` are kept (recorded as Micro Four Thirds).
    Each product page is turned into a raw record with
    ``item_page_to_raw_dict`` and the raw Japanese columns are converted into
    numeric / boolean columns.

    Focal lengths and magnification are multiplied by 2 for Micro Four
    Thirds and by 1.5 for other lenses whose name contains ``DC``.

    :param scraping: service used to download (and possibly cache) pages.
    :return: one row per lens / mount combination.
    """
    # Collect the (name, URL) pairs of the current lenses, per mount.
    page = scraping.get_page('https://www.sigma-global.com/jp/lenses/',
                             cache=False)
    lens_list_mft: List[Tuple[str, str]] = []
    lens_list_l: List[Tuple[str, str]] = []
    for li_element in page.find('div.p-lens-search__main').find_all('li'):
        lens_link = li_element.find('a').attrs['href']
        if 'lenses' not in lens_link:
            continue
        h4_element = li_element.find('h4')
        if h4_element is None:
            continue
        lens_name = h4_element.text
        mount_attr = li_element.attrs['data-lens-mount']
        if 'micro-four-thirds' in mount_attr:
            lens_list_mft.append((lens_name, lens_link))
        if 'l-mount' in mount_attr:
            lens_list_l.append((lens_name, lens_link))
        # Special case: this lens is always added to the L-mount list.
        if '35mm F1.4 DG HSM' in lens_name:
            lens_list_l.append((lens_name, lens_link))

    # Collect the discontinued lenses.
    page = scraping.get_page(
        'https://www.sigma-global.com/jp/lenses/discontinued/', cache=False)
    lens_list_old: List[Tuple[str, str]] = []
    for li_element in page.find_all('li.p-support-service__item'):
        a_element = li_element.find('a')
        lens_list_old.append(
            (a_element.find('h4 > span').text, a_element.attrs['href']))

    # Fetch the details of every lens.
    lens_raw_data_list: List[dict] = []
    for lens_list, lens_mount in [(lens_list_mft, 'マイクロフォーサーズ'),
                                  (lens_list_l, 'ライカL')]:
        for lens_name, lens_link in lens_list:
            # Append the product-line suffix when the URL implies one and the
            # name does not already carry it.
            if 'lenses/c' in lens_link and '| Contemporary' not in lens_name:
                lens_name2 = lens_name + ' | Contemporary'
            elif 'lenses/a' in lens_link and '| Art' not in lens_name:
                lens_name2 = lens_name + ' | Art'
            else:
                lens_name2 = lens_name

            page = scraping.get_page(lens_link)
            record = {'mount': lens_mount, 'name': lens_name2, 'url': lens_link}
            record.update(item_page_to_raw_dict(page, lens_mount))
            lens_raw_data_list.append(record)
    for lens_name, lens_link in lens_list_old:
        if 'DN' not in lens_name:
            # No "DN" in the name means it is not a mirrorless lens: skip it.
            continue
        page = scraping.get_page(lens_link)
        record = {'mount': 'マイクロフォーサーズ', 'name': lens_name, 'url': lens_link}
        raw_dict = item_page_to_raw_dict(page, '')
        if len(raw_dict) > 0:
            record.update(raw_dict)
            lens_raw_data_list.append(record)
    df = DataFrame.from_records(lens_raw_data_list)

    # Reshape for conversion.
    df['maker'] = 'SIGMA'
    df['product_number'] = df['エディションナンバー']
    for column in ('エディションナンバー', 'レンズ構成枚数', '画角', '絞り羽根枚数',
                   '最小絞り', '付属品', '対応マウント / バーコード'):
        del df[column]

    # focal_length
    w, t = extract_numbers(df['name'], [r'(\d+)-(\d+)mm'], [r'(\d+)mm'])
    wide_focal_length: List[int] = []
    telephoto_focal_length: List[int] = []
    for wf, tf, mount, name in zip(w, t, df['mount'], df['name']):
        if mount == 'マイクロフォーサーズ':
            wide_focal_length.append(int(wf) * 2)
            telephoto_focal_length.append(int(tf) * 2)
        elif 'DC' in name:
            wide_focal_length.append(int(1.5 * int(wf)))
            telephoto_focal_length.append(int(1.5 * int(tf)))
        else:
            wide_focal_length.append(int(wf))
            telephoto_focal_length.append(int(tf))
    df['wide_focal_length'] = wide_focal_length
    df['telephoto_focal_length'] = telephoto_focal_length

    # f_number
    w, t = extract_numbers(df['name'], [r'F(\d+\.?\d*)-(\d+\.?\d*)'],
                           [r'F(\d+\.?\d*)'])
    df['wide_f_number'] = [float(x) for x in w]
    df['telephoto_f_number'] = [float(x) for x in t]

    # min_focus_distance (source is in cm; scaleb(1) multiplies by 10).
    # Every pattern is a raw string: "\d" inside a normal or f-string is an
    # invalid escape sequence.
    w, t = extract_numbers(df['最短撮影距離'], [
        r'(\d+\.?\d*)-(\d+\.?\d*)cm', r'(\d+\.?\d*) \(W\)-(\d+\.?\d*) \(T\)cm',
        r'(\d+\.?\d*)\(W\) - (\d+\.?\d*)\(T\)cm',
        r'(\d+\.?\d*)（W）-(\d+\.?\d*)（T）cm'
    ], [r'(\d+\.?\d*)cm'])
    df['wide_min_focus_distance'] = [int(Decimal(x).scaleb(1)) for x in w]
    df['telephoto_min_focus_distance'] = [int(Decimal(x).scaleb(1)) for x in t]
    del df['最短撮影距離']

    # max_photographing_magnification: the text holds one or two "1:N"
    # ratios; with two, the smaller N (i.e. the larger magnification) wins.
    m: List[float] = []
    for text, mount, name in zip(df['最大撮影倍率'].values, df['mount'].values,
                                 df['name'].values):
        text = text.replace('：', ':')
        temp = regex(text, r'.*1:(\d+\.?\d*).*1:(\d+\.?\d*).*')
        if len(temp) > 0:
            denominator = temp[0] if float(temp[0]) < float(temp[1]) else temp[1]
        else:
            denominator = regex(text, r'.*1:(\d+\.?\d*).*')[0]
        if mount == 'マイクロフォーサーズ':
            factor = Decimal('2')
        elif 'DC' in name:
            factor = Decimal('1.5')
        else:
            factor = Decimal('1')
        m.append(float((factor / Decimal(denominator)).quantize(Decimal('0.01'))))
    df['max_photographing_magnification'] = m
    del df['最大撮影倍率']

    # filter_diameter; -1 when missing (NaN) or unparsable.
    filter_diameter: List[float] = []
    for f in df['フィルターサイズ']:
        result = regex(f, r'(\d+.?\d*)mm') if f == f else []
        filter_diameter.append(float(result[0]) if len(result) > 0 else -1)
    df['filter_diameter'] = filter_diameter
    del df['フィルターサイズ']

    # is_drip_proof
    df['is_drip_proof'] = df['name'].map(lambda x: 'DC' in x or 'DG' in x)

    # has_image_stabilization
    df['has_image_stabilization'] = df['name'].map(lambda x: 'OS' in x)

    # is_inner_zoom: True when both ends of the focal range are equal.
    df['is_inner_zoom'] = [
        wide == tele
        for wide, tele in zip(wide_focal_length, telephoto_focal_length)
    ]

    # overall_diameter, overall_length
    overall_diameter, overall_length = extract_numbers(
        df['最大径 × 長さ'], [r'(\d+\.?\d*)mm[^\d]*(\d+\.?\d*)mm'], [])
    # Fill in by hand the lenses whose page does not carry the data.
    manual_size = {
        '19mm F2.8 EX DN': ('60.6', '45.7'),
        '30mm F2.8 EX DN': ('60.6', '38.6'),
        '19mm F2.8 DN | Art': ('60.8', '45.7'),
        '30mm F2.8 DN | Art': ('60.8', '40.5'),
        '60mm F2.8 DN | Art': ('60.8', '55.5'),
    }
    for i, name in enumerate(df['name'].values):
        if name in manual_size:
            overall_diameter[i], overall_length[i] = manual_size[name]
    df['overall_diameter'] = [float(x) for x in overall_diameter]
    df['overall_length'] = [float(x) for x in overall_length]
    del df['最大径 × 長さ']

    # weight: hand-filled values for lenses whose page has no weight.
    manual_weight = {
        '19mm F2.8 EX DN': 140,
        '30mm F2.8 EX DN': 130,
        '19mm F2.8 DN | Art': 160,
        '30mm F2.8 DN | Art': 140,
        '60mm F2.8 DN | Art': 190,
    }
    weight: List[float] = []
    for f, name in zip(df['質量'].values, df['name'].values):
        if f != f:
            # Missing weight: use the manual value, or -1 for an unlisted
            # lens so that the list always has one entry per row.
            weight.append(manual_weight.get(name, -1))
            continue
        result = regex(f, r'([\d,]+)g')
        if len(result) > 0:
            weight.append(int(result[0].replace(',', '')))
        else:
            weight.append(int(f))
    df['weight'] = weight
    del df['質量']

    # price
    price: List[float] = []
    for f in df['希望小売価格']:
        result = regex(f, r'([\d,]+) *円')
        if len(result) > 0:
            price.append(int(result[0].replace(',', '')))
        else:
            price.append(26240)  # ad-hoc fix
    df['price'] = price
    del df['希望小売価格']

    return df

Esempio n. 3

Mostra file

def get_samyang_lens_list(scraping: IScrapingService) -> DataFrame:
    """Scrape the SAMYANG pages on kenko-tokina.co.jp into a DataFrame.

    Only list items whose ``data-spec3`` attribute mentions Micro Four Thirds
    are followed.  Every ``<table>`` on a lens page becomes one raw record;
    the raw Japanese columns are then converted into numeric / boolean
    columns.  Focal lengths and magnification are multiplied by 2;
    unparsable values fall back to ``0`` (distances, magnification) or
    ``-1`` (filter diameter, dimensions, weight).

    :param scraping: service used to download (and possibly cache) pages.
    :return: one row per spec table found.
    """
    # Collect (name, URL, mount) of every Micro Four Thirds lens.
    lens_list: List[Tuple[str, str, str]] = []
    page = scraping.get_page(
        'https://www.kenko-tokina.co.jp/camera-lens/samyang/', cache=False)
    for li_element in page.find_all('li.col.list_item'):
        lens_name = li_element.find('h3 > a').text
        lens_url = li_element.find('h3 > a').attrs['href']
        if 'data-spec3' in li_element.attrs:
            mount_info = li_element.attrs['data-spec3']
            if 'マイクロフォーサーズ' in mount_info:
                lens_list.append((lens_name, lens_url, 'マイクロフォーサーズ'))

    # Read the spec tables of every lens page.
    per_mount_headers = ('大きさ', '全長', '質量', '重さ')
    temp_list: List[dict] = []
    for lens_name, lens_url, lens_mount in lens_list:
        page = scraping.get_page(lens_url)
        for table_element in page.find_all('table'):
            temp = {'name': lens_name, 'url': lens_url, 'mount': lens_mount}
            for tr_element in table_element.find_all('tr'):
                th_element = tr_element.find('th')
                td_element = tr_element.find('td')
                if th_element is None or td_element is None:
                    # A row without a header/value pair carries no spec.
                    continue
                header = th_element.text
                if header in per_mount_headers:
                    # Size and weight differ per mount and are listed one
                    # per <br>-separated line: prefer the Micro Four Thirds
                    # line, then the Sony E line, then the first line.
                    lines = td_element.html.replace('<td>', '').replace(
                        '</td>', '').replace('\n', '').split('<br>')
                    chosen = next(
                        (line for line in lines if 'マイクロフォーサーズ' in line), '')
                    if chosen == '':
                        chosen = next(
                            (line for line in lines if 'ソニーE' in line), '')
                    if chosen == '':
                        chosen = lines[0]
                    temp[header] = chosen.replace('\n', '')
                else:
                    temp[header] = td_element.text
            # NOTE(review): every table is recorded, including tables that
            # contribute no spec rows — confirm this is intended.
            temp_list.append(temp)
    df = DataFrame.from_records(temp_list)

    # Reshape for conversion.
    df['maker'] = 'SAMYANG'
    df['price'] = 0
    for column in ('レンズ構成', 'レンズフード', 'マウント', 'JANコード',
                   'フォーマットサイズ', '絞り羽根', '絞り羽根枚数', '付属品',
                   'JANコード：'):
        del df[column]

    # Focal length (multiplied by 2).
    w, t = extract_numbers(df['焦点距離'], [], [r'(\d+\.?\d*)mm'])
    df['wide_focal_length'] = [int(Decimal(x) * 2) for x in w]
    df['telephoto_focal_length'] = [int(Decimal(x) * 2) for x in t]
    del df['焦点距離']
    del df['画角']

    # F-number, taken from the lens name.
    w, t = extract_numbers(df['name'], [], [r'F(\d+\.?\d*)'])
    df['wide_f_number'] = [float(x) for x in w]
    df['telephoto_f_number'] = [float(x) for x in t]
    del df['明るさ']
    del df['絞り']

    # Minimum focus distance in millimetres; 0 when nothing can be parsed.
    distance_patterns = [(r'(\d+.?\d*)cm', 10), (r'(\d+.?\d*)( *)m', 1000)]
    w: List[int] = []
    t: List[int] = []
    for fd in df['最短撮影距離'].values:
        distance = 0
        # A missing cell is NaN (a float) and cannot be searched.
        if isinstance(fd, str):
            for pattern, to_mm in distance_patterns:
                result = regex(fd, pattern)
                if len(result) > 0:
                    distance = int(Decimal(result[0]) * to_mm)
                    break
        w.append(distance)
        t.append(distance)
    df['wide_min_focus_distance'] = w
    df['telephoto_min_focus_distance'] = t
    del df['最短撮影距離']

    # Maximum magnification (multiplied by 2 for Micro Four Thirds).
    mag_list: List[float] = []
    for val1, val2 in zip(df['最大撮影倍率'].values, df['mount'].values):
        value = Decimal(0)
        if val1 == val1:  # skip NaN
            result = regex(val1, r'(\d+.?\d*)倍')
            if len(result) > 0:
                value = Decimal(result[0])
        if val2 == 'マイクロフォーサーズ':
            mag_list.append(float(value * 2))
        else:
            mag_list.append(float(value))
    df['max_photographing_magnification'] = mag_list
    del df['最大撮影倍率']

    # Filter diameter: stored under either of two headers; -1 when neither
    # is present or parsable.
    fd_list: List[float] = []
    for record in df.to_dict(orient='records'):
        text = record['フィルターサイズ']
        if text != text:
            text = record['フィルター径']
        result = regex(text, r'(\d+.?\d*)mm') if text == text else []
        if len(result) == 0:
            fd_list.append(-1)
            continue
        fd_list.append(int(result[0]))
    df['filter_diameter'] = fd_list
    del df['フィルターサイズ']
    del df['フィルター径']

    df['is_drip_proof'] = False
    df['has_image_stabilization'] = False

    # Flagged True when both ends of the focal range are equal.
    df['is_inner_zoom'] = [
        bool(wide == tele)
        for wide, tele in zip(df['wide_focal_length'], df['telephoto_focal_length'])
    ]

    def first_number(text) -> float:
        """Return the first number found in *text*, or -1 for NaN / no match."""
        if text != text:
            return -1
        result = regex(text, r'(\d+.?\d*)')
        if len(result) > 0:
            return float(result[0])
        return -1

    # Dimensions: either one combined cell or two separate cells.
    diameter_list: List[float] = []
    length_list: List[float] = []
    for record in df.to_dict(orient='records'):
        text = record['大きさ']
        if text == text:
            result = regex(text, r'(\d+.?\d*)[^\d.]+(\d+.?\d*) *mm')
            if len(result) != 2:
                diameter_list.append(-1)
                length_list.append(-1)
            else:
                # The first capture may swallow the "×" separator; the first
                # number is stored as length and the second as diameter.
                length_list.append(float(result[0].replace('×', '')))
                diameter_list.append(float(result[1]))
        else:
            diameter_list.append(first_number(record['最大径']))
            length_list.append(first_number(record['全長']))
    df['overall_diameter'] = diameter_list
    df['overall_length'] = length_list
    del df['大きさ']
    del df['最大径']
    del df['全長']

    # Weight in grams (rounded half up): stored under either of two
    # headers; -1 when neither is present or parsable.
    weight_list: List[float] = []
    for record in df.to_dict(orient='records'):
        text = record['質量']
        if text != text:
            text = record['重さ']
        result = regex(text, r'(\d+.?\d*)g') if text == text else []
        if len(result) == 0:
            weight_list.append(-1)
            continue
        weight_list.append(int(float(result[0]) + 0.5))
    df['weight'] = weight_list
    del df['質量']
    del df['重さ']
    return df

Esempio n. 4

Mostra file

def get_olympus_lens_list(scraping: IScrapingService) -> DataFrame:
    """Scrape the OLYMPUS M.ZUIKO lens pages into a DataFrame.

    Lens names and product numbers are collected from the current line-up
    page and from the "record" page.  For every lens the spec page and the
    product top page are read into one raw record, unneeded columns are
    dropped and differently spelled headers are unified so the records can
    be combined.  The raw Japanese columns are then converted into numeric /
    boolean columns; focal lengths are multiplied by 2 and every row gets
    the Micro Four Thirds mount.

    :param scraping: service used to download (and possibly cache) pages.
    :return: one row per lens.
    """
    # Collect (name, product number) of every lens.
    page = scraping.get_page(
        'https://www.olympus-imaging.jp/product/dslr/mlens/index.html',
        cache=False)
    lens_list: List[Tuple[str, str]] = []
    for a_element in page.find_all('h2.productName > a'):
        lens_name = a_element.text.split('/')[0].replace('\n', '')
        if 'M.ZUIKO' not in lens_name:
            continue
        lens_product_number = a_element.attrs['href'].replace(
            '/product/dslr/mlens/', '').replace('/index.html', '')
        lens_list.append((lens_name, lens_product_number))

    page = scraping.get_page(
        'https://www.olympus-imaging.jp/product/dslr/record/index.html',
        cache=False)
    for section_element in page.find_all('div.section'):
        heading_element = section_element.find('div.mb15 > h2')
        link_element = section_element.find('li > a')
        if heading_element is None or link_element is None:
            continue
        lens_name = heading_element.text
        if 'M.ZUIKO DIGITAL' not in lens_name:
            continue
        lens_product_number = link_element.attrs['href'].replace(
            '/product/dslr/mlens/', '').replace('/index.html', '')
        lens_list.append((lens_name, lens_product_number))

    # Columns that are not needed downstream.
    del_column_list = [
        'レンズ構成',
        'フォーカシング方式',
        'AF方式',
        '特長',
        'マウント規格',
        '画角',
        '最近接撮影範囲',
        '絞り羽枚数',
        '同梱品',
        '主な同梱品',
        '別売りアクセサリー',
        '別売アクセサリー',
        '製品名',
        'JANコード',
        'JAN',
        '発売日',
        'オンラインショップ',
        'フード',
        '最大口径比',
        '最小口径比',
        '最大口径比／最小口径比',
        '35mm判換算最大撮影倍率',
        '最大撮影倍率（35mm判換算）',
        '手ぶれ補正性能',
        'ズーム',
        'ズーム方式',
        '35mm判換算',
        '絞り範囲',
    ]
    # Header spellings that must be unified before the records can be
    # combined: (spelling found on the page, unified column name).
    rename_list = [
        ('大きさ　最大径×長さ', '大きさ 最大径×全長'),
        ('大きさ　最大径 × 全長', '大きさ 最大径×全長'),
        ('大きさ　最大径×全長', '大きさ 最大径×全長'),
        ('大きさ 最大径 x 全長', '大きさ 最大径×全長'),
        ('防滴性能 / 防塵機構', '防滴処理'),
        ('防滴性能／防塵機構', '防滴処理'),
        ('防滴性能 / 防塵機構搭載', '防滴処理'),
        ('価格', '希望小売価格'),
    ]

    # Fetch the details of every lens.
    lens_data_list: List[Dict[str, str]] = []
    for lens_name, lens_product_number in lens_list:
        # Read the spec page (one product uses a different path).
        if lens_product_number != '14-42_35-56':
            spec_url = f'https://www.olympus-imaging.jp/product/dslr/mlens/{lens_product_number}/spec.html'
        else:
            spec_url = f'https://www.olympus-imaging.jp/product/dslr/mlens/{lens_product_number}/spec/index.html'
        page = scraping.get_page(spec_url)
        temp_dict: Dict[str, str] = {}
        for tr_element in page.find('table').find_all('tr'):
            # The header text may be wrapped in <span> or <strong> or sit
            # directly in <th>, so try each in turn.
            th_element = None
            for selector in ('th > span', 'th > strong', 'th'):
                th_element = tr_element.find(selector)
                if th_element is not None:
                    break

            # The value cell is used as is.
            td_element = tr_element.find('td')

            if th_element is None or td_element is None:
                # A row without a header/value pair carries no spec.
                continue
            temp_dict[th_element.text] = td_element.text

        # Read the product top page.
        index_url = f'https://www.olympus-imaging.jp/product/dslr/mlens/{lens_product_number}/index.html'
        page = scraping.get_page(index_url)
        temp_dict['URL'] = index_url
        table_element = page.find('table')
        # th and td are paired differently from the spec page because, on
        # the top page of M.ZUIKO DIGITAL ED 30mm F3.5 Macro only, the list
        # price alone could not be read with the row-based approach.
        for th_element, td_element in zip(table_element.find_all('th'),
                                          table_element.find_all('td')):
            label_element = th_element.find('span')
            if label_element is None:
                label_element = th_element.find('strong')
            if label_element is None:
                label_element = th_element
            temp_dict[label_element.text] = td_element.text

        # Add the required columns.
        temp_dict['name'] = lens_name.replace('　', ' ')
        temp_dict['product_number'] = lens_product_number

        # Drop the unneeded columns.
        for column in del_column_list:
            temp_dict.pop(column, None)

        # Unify the header spellings.
        for found_name, unified_name in rename_list:
            if found_name in temp_dict:
                temp_dict[unified_name] = temp_dict.pop(found_name)
        lens_data_list.append(temp_dict)

    df = DataFrame.from_records(lens_data_list)

    # Reshape for conversion.
    df['maker'] = 'OLYMPUS'

    # focal_length (multiplied by 2)
    w, t = extract_numbers(df['焦点距離'], [r'(\d+)-(\d+)mm', r'(\d+) - (\d+)mm'],
                           [r'(\d+)mm'])
    df['wide_focal_length'] = [int(x) * 2 for x in w]
    df['telephoto_focal_length'] = [int(x) * 2 for x in t]
    # M.ZUIKO DIGITAL ED 150-400mm F4.5 TC1.25x IS PRO has a built-in
    # teleconverter, so its long end is overridden.  A single .loc call is
    # used because chained indexing may assign to a temporary copy.
    df.loc[df['product_number'] == '150-400_45ispro',
           'telephoto_focal_length'] = 1000
    del df['焦点距離']

    # f_number
    w, t = extract_numbers(df['name'], [r'F(\d+\.?\d*)-(\d+\.?\d*)'],
                           [r'F(\d+\.?\d*)'])
    df['wide_f_number'] = [float(x) for x in w]
    df['telephoto_f_number'] = [float(x) for x in t]

    # min_focus_distance (source is in metres; scaleb(3) multiplies by 1000)
    w, t = extract_numbers(df['最短撮影距離'], [
        r'(\d+\.?\d+)m（.+） / (\d+\.?\d+)m（.+）',
        r'(\d+\.?\d+)m \(.+\) / (\d+\.?\d+)m \(.+\)',
        r'(\d+\.?\d+)m.+／(\d+\.?\d+)m.+'
    ], [r'(\d+\.?\d+)m', r'(\d+\.?\d+) m'])
    df['wide_min_focus_distance'] = [int(Decimal(x).scaleb(3)) for x in w]
    df['telephoto_min_focus_distance'] = [int(Decimal(x).scaleb(3)) for x in t]
    del df['最短撮影距離']

    # max_photographing_magnification
    w, t = extract_numbers(df['最大撮影倍率'], [
        r'(\d+\.?\d+)倍 \(Wide\) ／ (\d+\.?\d+)倍 \(Tele\)',
        r'Wide：(\d+\.?\d+)倍／Tele:(\d+\.?\d+)倍',
        r'(\d+\.?\d+)倍（Wide） / (\d+\.?\d+)倍（Tele）',
        r'Wide：(\d+\.?\d+)倍／Tele：(\d+\.?\d+)倍',
        r'(\d+\.?\d+)倍（Wide）/ (\d+\.?\d+)倍（Tele）'
    ], [
        r'(\d+\.?\d+)倍（35mm判換算\d+\.?\d+倍相当）',
        r'(\d+\.?\d+)倍（35mm判換算 \d+\.?\d+倍相当）', r'(\d+\.?\d+)倍（Wide / Tele）',
        r'(\d+\.?\d+)倍 \(35mm判換算 \d+\.?\d+倍相当\)',
        r'(\d+\.?\d+)倍（35mm判換算 \d+\.?\d+倍）',
        r'(\d+\.?\d+)倍（マクロモード時）（35mm判換算 \d+\.?\d+倍）'
    ])
    m: List[float] = []
    for a, b, text in zip(w, t, df['最大撮影倍率'].values):
        if a == b:
            mm = float(Decimal(a) * 2)
        else:
            mm = max(float(Decimal(a)), float(Decimal(b)))
            # Doubled unless the text contains "換算".
            if '換算' not in text:
                mm = mm * 2
        m.append(mm)
    df['max_photographing_magnification'] = m
    del df['最大撮影倍率']

    # filter_diameter; -1 when missing (NaN) or unparsable.
    filter_diameter: List[float] = []
    for f in df['フィルターサイズ']:
        result = regex(f, r'(\d+.?\d*)mm') if f == f else []
        filter_diameter.append(float(result[0]) if len(result) > 0 else -1)
    df['filter_diameter'] = filter_diameter
    del df['フィルターサイズ']

    # is_drip_proof: True when the cell is present and not empty.
    df['is_drip_proof'] = df['防滴処理'].map(lambda x: x == x and x != '')
    del df['防滴処理']

    # has_image_stabilization
    df['has_image_stabilization'] = df['name'].map(lambda x: 'IS' in x)
    del df['レンズ内手ぶれ補正機構']

    # is_inner_zoom: equal focal-range ends, plus three products listed by
    # product number.
    listed_inner_zoom = ['7-14_28pro', '40-150_28pro', '150-400_45ispro']
    df['is_inner_zoom'] = [
        bool(wide == tele) or product_number in listed_inner_zoom
        for wide, tele, product_number in zip(df['wide_focal_length'],
                                              df['telephoto_focal_length'],
                                              df['product_number'])
    ]

    # overall_diameter, overall_length
    d, le = extract_numbers(df['大きさ 最大径×全長'], [
        r'φ(\d+.?\d*)x(\d+.?\d*)mm', r'Ø(\d+.?\d*)×(\d+.?\d*)mm',
        r'Φ (\d+.?\d*) mm  ｘ (\d+.?\d*) mm', r'⌀(\d+.?\d*) x (\d+.?\d*)mm',
        r'Ø(\d+.?\d*) × (\d+.?\d*)mm', r'Ø(\d+.?\d*) x (\d+.?\d*)mm',
        r'Ø(\d+.?\d*)mm x (\d+.?\d*)mm', r'Ø(\d+.?\d*)x (\d+.?\d*)mm',
        r'Ø(\d+.?\d*)x(\d+.?\d*)mm', r'⌀(\d+.?\d*)×(\d+.?\d*)mm',
        r'Ø(\d+.?\d*)mm × (\d+.?\d*)mm', r'φ(\d+.?\d*)×(\d+.?\d*)mm'
    ], [])
    df['overall_diameter'] = [float(x) for x in d]
    df['overall_length'] = [float(x) for x in le]
    del df['大きさ 最大径×全長']

    # weight in grams; -1 when missing (NaN) or unparsable.
    weight: List[float] = []
    for f in df['質量']:
        result = regex(f, r'([\d,]+)[^\d]*(g|ｇ)') if f == f else []
        if len(result) > 0:
            weight.append(int(result[0].replace(',', '')))
        else:
            weight.append(-1)
    df['weight'] = weight
    del df['質量']

    # price in yen; -1 when missing (NaN) or unparsable.
    price: List[float] = []
    for f in df['希望小売価格']:
        result = regex(f, r'([\d,]+)円') if f == f else []
        if len(result) > 0:
            price.append(int(result[0].replace(',', '')))
        else:
            price.append(-1)
    df['price'] = price
    del df['希望小売価格']

    # mount and url
    df['mount'] = 'マイクロフォーサーズ'
    df['url'] = df['URL']
    del df['URL']
    return df

Esempio n. 5

Mostra file

def get_panasonic_old_lens_list(scraping: IScrapingService) -> DataFrame:
    """Scrape Panasonic's G-series lens listing into a normalized DataFrame.

    The listing page is scanned for links into the product database; for
    every such link the matching ``_spec.html`` page is fetched and the
    header/value cells of its first table are read.  The raw Japanese
    columns are then converted into the common output columns (``maker``,
    ``name``, ``product_number``, focal lengths, f-numbers, focus distances,
    ``filter_diameter``, ``overall_diameter``, ``overall_length``,
    ``weight``, ``price``, ``mount``, ``url`` and the boolean flags).
    Scraped columns that are not explicitly deleted below stay in the frame.

    Args:
        scraping: Service used to download and parse the pages.

    Returns:
        A DataFrame with one row per product link, ordered by link URL.

    Raises:
        KeyError: If one of the Japanese columns that is read or deleted
            below is missing from the scraped data.
    """
    # Open the listing page
    page = scraping.get_page(
        'https://panasonic.jp/dc/products/g_series_lens.html', cache=False)

    # Collect the product-database URLs linked from the listing
    link_url_set: Set[str] = set()
    for a_element in page.find_all('a'):
        # An anchor without an href cannot be a product link; skipping it
        # also avoids a KeyError on the attribute lookup.
        if 'href' not in a_element.attrs:
            continue
        link_url = a_element.attrs['href']
        if len(regex(link_url,
                     r'(http://panasonic\.jp/dc/p-db/.+\.html)')) > 0:
            link_url_set.add(link_url)

    # Fetch the spec page of every product.  The set is sorted so that the
    # row order of the result is the same on every run.
    temp_list: List[Dict[str, any]] = []
    for link_url in sorted(link_url_set):
        page = scraping.get_page(link_url.replace('.html', '_spec.html'))
        table_element = page.find('table')
        temp_dict: Dict[str, any] = {}
        temp_dict['リンク'] = link_url
        temp_dict['型番'] = regex(link_url,
                                r'http://panasonic\.jp/dc/p-db/(.+)\.html')[0]
        # Header and value cells are paired up purely by position
        for th_element, td_element in zip(table_element.find_all('th'),
                                          table_element.find_all('td')):
            temp_dict[th_element.text] = td_element.text
        temp_list.append(temp_dict)
    df = DataFrame.from_records(temp_list)

    # Reshape into the common output columns
    df['maker'] = 'Panasonic'

    df['name'] = df['レンズ名称']
    del df['レンズ名称']

    df['product_number'] = df['型番']
    del df['型番']

    # Focal lengths are doubled
    # NOTE(review): presumably the 35mm-equivalent conversion - confirm
    w, t = extract_numbers(df['焦点距離'], [r'(\d+)mm～(\d+)mm', r'(\d+)-(\d+)mm'],
                           [r'(\d+)mm'])
    df['wide_focal_length'] = [int(x) * 2 for x in w]
    df['telephoto_focal_length'] = [int(x) * 2 for x in t]
    del df['焦点距離']

    # The f-numbers are parsed from the lens name, not from the spec columns
    w, t = extract_numbers(df['name'], [r'F(\d+\.?\d*)-(\d+\.?\d*)'],
                           [r'F(\d+\.?\d*)'])
    df['wide_f_number'] = [float(x) for x in w]
    df['telephoto_f_number'] = [float(x) for x in t]
    del df['開放絞り']
    del df['絞り形式']
    del df['最小絞り']

    # The captured values are followed by "m" in the source text; scaleb(3)
    # multiplies them by 1000 before the integer conversion.
    w, t = extract_numbers(
        df['最短撮影距離'],
        [r'(\d+\.?\d+)m / (\d+\.?\d+)m', r'(\d+\.?\d+)m～∞.*(\d+\.?\d+)m～∞'],
        [r'(\d+\.?\d+)m', r'(\d+\.?\d+)m～∞'])
    df['wide_min_focus_distance'] = [int(Decimal(x).scaleb(3)) for x in w]
    df['telephoto_min_focus_distance'] = [int(Decimal(x).scaleb(3)) for x in t]
    del df['最短撮影距離']

    df['max_photographing_magnification'] = 0.0  # not listed on the page

    # -1 marks entries whose filter diameter could not be parsed
    filter_diameter: List[float] = []
    for f in df['フィルター径']:
        result = regex(f, r'(\d+.?\d*)mm')
        filter_diameter.append(float(result[0]) if len(result) > 0 else -1)
    df['filter_diameter'] = filter_diameter
    del df['フィルター径']

    df['is_drip_proof'] = False  # not listed on the page

    df['has_image_stabilization'] = df['name'].map(lambda x: 'O.I.S.' in x)

    # A lens counts as inner zoom when it is a prime (wide == telephoto) or
    # when its product number is in this hand-maintained list.
    inner_zoom_products = [
        'H-F007014', 'H-E08018', 'H-PS45175', 'S-E70200', 'S-R70200'
    ]
    df['is_inner_zoom'] = [
        bool(record['wide_focal_length'] == record['telephoto_focal_length']
             or record['product_number'] in inner_zoom_products)
        for record in df.to_records()
    ]

    d, le = extract_numbers(df['外形寸法'], [r'(\d+\.?\d*)mm[^\d]*(\d+\.?\d*)mm'],
                            [])
    df['overall_diameter'] = [float(x) for x in d]
    df['overall_length'] = [float(x) for x in le]
    del df['外形寸法']

    # -1 marks entries whose weight could not be parsed
    weight: List[float] = []
    for f in df['質量']:
        result = regex(f, r'([\d,]+)g')
        weight.append(
            int(result[0].replace(',', '')) if len(result) > 0 else -1)
    df['weight'] = weight
    del df['質量']

    df['price'] = 0  # not listed on the page

    df['mount'] = 'マイクロフォーサーズ'
    del df['レンズ構成']
    del df['マウント']
    df['url'] = df['リンク']
    del df['リンク']

    return df

Esempio n. 6

Mostra file

def _read_panasonic_comparison_table(table_element) -> DataFrame:
    """Turn one comparison ``<table>`` element into a DataFrame, one row per lens."""
    table_df = DataFrame()
    table_df['レンズ名'] = [
        cleansing(x.text) for x in table_element.find_all('th p')
    ]
    table_df['URL'] = [
        'https://panasonic.jp' + x.attrs['href']
        for x in table_element.find_all('th a')
    ]
    for tr_element in table_element.find_all('tbody > tr'):
        th_element = tr_element.find('th')
        # A row without a header cell has no column name to store it under
        if th_element is None:
            continue
        key = cleansing(th_element.text)
        table_df[key] = [cleansing(x.text) for x in tr_element.find_all('td')]
    return table_df


def get_panasonic_lens_list(scraping: IScrapingService) -> DataFrame:
    """Scrape Panasonic's lens comparison page into a normalized DataFrame.

    The first table whose text contains ``LUMIX G`` and the first table
    whose text contains ``LUMIX S`` are read, their columns are renamed to
    the common names, and the two frames are concatenated and converted to
    typed values.

    Args:
        scraping: Service used to download and parse the page.

    Returns:
        A DataFrame with one row per lens of both tables.  The row index is
        not reset after concatenation, so index labels may repeat.
    """
    # Open the comparison page
    page = scraping.get_page('https://panasonic.jp/dc/comparison.html',
                             cache=False)

    # Collect the data from the <table> tags
    df1 = DataFrame()
    for table_element in page.find_all('table'):
        if 'LUMIX G' not in table_element.full_text:
            continue
        df1 = _read_panasonic_comparison_table(table_element)
        break

    df2 = DataFrame()
    for table_element in page.find_all('table'):
        if 'LUMIX S' not in table_element.full_text:
            continue
        df2 = _read_panasonic_comparison_table(table_element)
        # In this table one header and its value cells are direct children
        # of <tbody> rather than of a <tr>, so they are picked up separately.
        key = cleansing(table_element.find('tbody > th').text)
        value = [
            cleansing(x.text) for x in table_element.find_all('tbody > td')
        ]
        df2[key] = value
        break

    # Rename/drop columns so that both frames can be concatenated
    df1 = convert_columns(
        df1, {
            'レンズ名': 'name',
            'URL': 'url',
            '品番': 'product_number',
            '35mm判換算焦点距離': 'focal_length',
            '最短撮影距離': 'min_focus_distance',
            '最大撮影倍率': 'max_photographing_magnification',
            '手ブレ補正': 'has_image_stabilization',
            'フィルターサイズ': 'filter_diameter',
            '最大径×全長': 'overall_size',
            '質量': 'weight',
            '防塵・防滴': 'is_drip_proof',
            'メーカー希望小売価格': 'price',
        }, [
            'レンズ構成',
            '絞り羽根 / 形状',
            '最小絞り値',
            'レンズコーティング',
            '対角線画角',
            'レンズキャップ',
        ])
    df1['mount'] = 'マイクロフォーサーズ'

    df2 = convert_columns(
        df2, {
            'レンズ名': 'name',
            'URL': 'url',
            '品番': 'product_number',
            '焦点距離': 'focal_length',
            '撮影距離範囲': 'min_focus_distance',
            '手ブレ補正': 'has_image_stabilization',
            'フィルター径': 'filter_diameter',
            '防塵・防滴': 'is_drip_proof',
            '最大撮影倍率': 'max_photographing_magnification',
            '最大径×全長': 'overall_size',
            '質量': 'weight',
            'メーカー希望小売価格': 'price'
        }, [
            'レンズ構成',
            'マウント',
            '絞り羽根 / 形状',
            '開放絞り',
            '最小絞り',
        ])
    df2['mount'] = 'ライカL'

    # Concatenate
    df = pandas.concat([df1, df2])

    # Reshape into the common output columns
    df['maker'] = 'Panasonic'

    # focal_length
    w, t = extract_numbers(df['focal_length'],
                           [r'(\d+)mm～(\d+)mm', r'(\d+)-(\d+)mm'],
                           [r'(\d+)mm'])
    df['wide_focal_length'] = [int(x) for x in w]
    df['telephoto_focal_length'] = [int(x) for x in t]
    del df['focal_length']

    # f_number (parsed from the lens name)
    w, t = extract_numbers(df['name'], [r'F(\d+\.?\d*)-(\d+\.?\d*)'],
                           [r'F(\d+\.?\d*)'])
    df['wide_f_number'] = [float(x) for x in w]
    df['telephoto_f_number'] = [float(x) for x in t]

    # min_focus_distance: the captured values are followed by "m" in the
    # source text; scaleb(3) multiplies them by 1000.
    w, t = extract_numbers(
        df['min_focus_distance'],
        [r'(\d+\.?\d+)m / (\d+\.?\d+)m', r'(\d+\.?\d+)m～∞.*(\d+\.?\d+)m～∞'],
        [r'(\d+\.?\d+)m', r'(\d+\.?\d+)m～∞'])
    df['wide_min_focus_distance'] = [int(Decimal(x).scaleb(3)) for x in w]
    df['telephoto_min_focus_distance'] = [int(Decimal(x).scaleb(3)) for x in t]
    del df['min_focus_distance']

    # max_photographing_magnification: doubled for Micro Four Thirds rows
    # NOTE(review): presumably the 35mm-equivalent conversion - confirm
    m: List[float] = []
    for record in df.to_records():
        temp = record['max_photographing_magnification'].replace('倍', '')
        if record['mount'] == 'マイクロフォーサーズ':
            m.append(float(Decimal(temp) * 2))
        else:
            m.append(float(temp))
    df['max_photographing_magnification'] = m

    # filter_diameter (-1 when the value could not be parsed)
    filter_diameter: List[float] = []
    for f in df['filter_diameter']:
        result = regex(f, r'(\d+.?\d*)mm')
        filter_diameter.append(float(result[0]) if len(result) > 0 else -1)
    df['filter_diameter'] = filter_diameter

    # is_drip_proof
    df['is_drip_proof'] = df['is_drip_proof'].map(lambda x: x == '○')

    # has_image_stabilization
    df['has_image_stabilization'] = df['has_image_stabilization'].map(
        lambda x: x != '－')

    # is_inner_zoom: primes (wide == telephoto) plus a hand-maintained list
    inner_zoom_products = [
        'H-F007014', 'H-E08018', 'H-PS45175', 'S-E70200', 'S-R70200'
    ]
    df['is_inner_zoom'] = [
        bool(record['wide_focal_length'] == record['telephoto_focal_length']
             or record['product_number'] in inner_zoom_products)
        for record in df.to_records()
    ]

    # overall_diameter, overall_length
    d, le = extract_numbers(df['overall_size'],
                            [r'(\d+\.?\d*)mm[^\d]*(\d+\.?\d*)mm'], [])
    df['overall_diameter'] = [float(x) for x in d]
    df['overall_length'] = [float(x) for x in le]
    del df['overall_size']

    # weight (-1 when the value could not be parsed)
    weight: List[float] = []
    for f in df['weight']:
        result = regex(f, r'([\d,]+)g')
        weight.append(
            int(result[0].replace(',', '')) if len(result) > 0 else -1)
    df['weight'] = weight

    # price (-1 when the value could not be parsed)
    price: List[float] = []
    for f in df['price']:
        result = regex(f, r'([\d,]+) *円')
        price.append(
            int(result[0].replace(',', '')) if len(result) > 0 else -1)
    df['price'] = price
    return df

Esempio n. 7

Mostra file

def _parse_leica_working_range(text) -> Tuple[int, int]:
    """Return the (wide, telephoto) minimum focus distance parsed from a working-range text."""
    # Two comma-decimal values: one for each end of the zoom range
    match_result = regex(text, r'(\d+,\d*) m to infinity.+(\d+,\d*) m to infinity')
    if len(match_result) > 0:
        wide = int(Decimal(match_result[0].replace(',', '.')).scaleb(3))
        telephoto = int(Decimal(match_result[1].replace(',', '.')).scaleb(3))
        return wide, telephoto
    # A single value followed by "m": used for both ends, multiplied by 1000
    for pattern in (r'(\d+\.?\d*) m to infinity', r'∞ to (\d+\.?\d*) m'):
        match_result = regex(text, pattern)
        if len(match_result) > 0:
            distance = int(Decimal(match_result[0]).scaleb(3))
            return distance, distance
    # A single value followed by "mm": used as is
    match_result = regex(text, r'(\d+\.?\d*)mm to infinity')
    if len(match_result) > 0:
        return int(match_result[0]), int(match_result[0])
    # Nothing recognizable
    return 0, 0


def _parse_leica_dimension(text) -> float:
    """Return the millimetre value of a diameter/length text in one of three layouts."""
    if '/' in text:
        pattern = r'(\d+\.?\d*)/\d+ mm'
    elif ':' in text:
        pattern = r': (\d+\.?\d*) mm'
    else:
        pattern = r'(\d+\.?\d*) mm'
    # U+2009 (thin space) is replaced so the patterns' plain space matches
    return float(regex(text.replace('\u2009', ' '), pattern)[0])


def get_leica_lens_list(scraping: IScrapingService) -> DataFrame:
    """Scrape Leica's SL lens catalogue into a normalized DataFrame.

    The paginated catalogue is walked until a page without product teasers
    is returned.  For every product whose title contains ``SL`` and
    ``Leica`` but not ``hood``, the detail page is fetched and all its table
    rows are read as key/value pairs, which are then converted into the
    common output columns.

    Args:
        scraping: Service used to download and parse the pages.

    Returns:
        A DataFrame with one row per lens.

    Raises:
        KeyError: If an expected specification column is missing.
        IndexError: If a magnification, diameter or length value does not
            match any of the expected patterns.
    """
    # Collect the (name, URL) pairs of the lenses
    lens_list: List[Tuple[str, str]] = []
    page_index = 0
    while True:
        page_url = f'https://leica-camera.com/en-US/photography/lenses/sl?field_pim_categories=&page={page_index}'
        page = scraping.get_page(page_url, cache=False)
        article_elements = page.find_all('article.content-teasers-item')
        # A page without teasers marks the end of the catalogue
        if len(article_elements) == 0:
            break
        page_index += 1
        for article_element in article_elements:
            lens_name = article_element.find('div.field--name-external-field-main-product-title'
                                             ).full_text
            if 'SL' not in lens_name:
                continue
            if 'Leica' not in lens_name:
                continue
            if 'hood' in lens_name:
                continue
            lens_url = 'https://leica-camera.com' + article_element.find('a.node-link').attrs['href']
            lens_list.append((lens_name, lens_url))

    # Collect the raw specification of every lens
    lens_raw_data_list: List[Dict[str, any]] = []
    for lens_name, lens_url in lens_list:
        page = scraping.get_page(lens_url)
        temp: Dict[str, str] = {'レンズ名': lens_name, 'URL': lens_url}
        for tr_element in page.find_all('tr'):
            th_elements = tr_element.find_all('th')
            td_elements = tr_element.find_all('td')
            if len(td_elements) >= 2:
                # Rows laid out as <td>key</td><td>value</td>
                temp[td_elements[0].text] = td_elements[1].text
            elif len(th_elements) >= 1 and len(td_elements) >= 1:
                # Rows laid out as <th>key</th><td>value</td>
                temp[th_elements[0].text] = td_elements[0].text
        lens_raw_data_list.append(temp)
    df = DataFrame.from_records(lens_raw_data_list)

    # Reshape into the common output columns
    df['maker'] = 'LEICA'
    df['mount'] = 'ライカL'
    df = convert_columns(df, {
        'レンズ名': 'name',
        'URL': 'url',
        'Order Number': 'Order number',
        'Largest scale': 'Largest reproduction ratio',
        'Filter thread': 'Filter mount',
        'Length': 'Length to bayonet mount',
        'Diameter': 'Largest diameter',
        'Black, anodized': 'Order number',
        'Length to bayonet flange': 'Length to bayonet mount',
        'Focus range': 'Working range',
    }, [
        'Field angle (diagonal, horizontal, vertical)',
        'Number of lenses/groups',
        'Number of asph. surfaces / lenses',
        'Entrance pupil position',
        'Smallest object field',
        'Setting/function',
        'Aperture setting range',
        'Lowest value',
        'Bayonet/sensor format',
        'View angle (diagonal/horizontal/vertical) Full-frame (24 × 36 mm)',
        'Number of lenses/assemblies',
        'Number of aspherical surfaces',
        'Position of the entrance pupil before the bayonet',
        'Setting',
        'Setting/Function',
        'Smallest aperture',
        'Bayonet',
        'Lens hood',
        'Full-frame (24 × 36 mm)',
        'Angle of view (diagonal, horizontal, vertical)',
        'Number of elements/groups',
        'Position of entrance pupil',
        'Smallest value',
        'Number of aspherical lenses',
    ])

    # product_number
    df['product_number'] = df['Order number'].map(lambda x: str(x).replace(' ', ''))
    del df['Order number']

    # wide_focal_length, telephoto_focal_length (parsed from the lens name)
    w, t = extract_numbers(df['name'], [r'SL (\d+)-(\d+)mm f', r'SL(\d+)-(\d+) f', r'SL (\d+)-(\d+) f'],
                           [r'SL(\d+) f', r'SL (\d+) f', r'SL 1:\d+\.?\d*/(\d+)'])
    df['wide_focal_length'] = w
    df['telephoto_focal_length'] = t

    # wide_f_number, telephoto_f_number (parsed from the lens name)
    w, t = extract_numbers(df['name'], [r'f/(\d+\.?\d*)-(\d+\.?\d*)'],
                           [r'f/(\d+\.?\d*)', r'SL 1:(\d+\.?\d*)/\d+'])
    df['wide_f_number'] = w
    df['telephoto_f_number'] = t

    # wide_min_focus_distance, telephoto_min_focus_distance
    wide_distances: List[int] = []
    telephoto_distances: List[int] = []
    for working_range in df['Working range'].values:
        wide, telephoto = _parse_leica_working_range(working_range)
        wide_distances.append(wide)
        telephoto_distances.append(telephoto)
    df['wide_min_focus_distance'] = wide_distances
    df['telephoto_min_focus_distance'] = telephoto_distances
    del df['Working range']

    # max_photographing_magnification: "1:N" becomes 1/N rounded to 0.01
    m: List[float] = []
    for _, row in df.iterrows():
        denominator = regex(row['Largest reproduction ratio'].replace(',', '.'), r'1:(\d+\.?\d*)')
        m.append(float((Decimal('1') / Decimal(denominator[0])).quantize(Decimal('0.01'))))
    df['max_photographing_magnification'] = m
    del df['Largest reproduction ratio']

    # filter_diameter: strip the "E" from the value before converting
    df['filter_diameter'] = df['Filter mount'].map(lambda x: int(str(x).replace('E', '')))
    del df['Filter mount']

    # is_drip_proof, has_image_stabilization, is_inner_zoom
    is_drip_proof = []
    has_image_stabilization = []
    is_inner_zoom = []
    for _, row in df.iterrows():
        is_drip_proof.append(False)
        ois = row['O.I.S. Performance as per CIPA']
        # A lens whose page lacks this row holds NaN here, and NaN is
        # truthy; `ois == ois` is False only for NaN, so a missing value
        # counts as "no stabilization".
        has_image_stabilization.append(bool(ois == ois and bool(ois)))
        # Primes (wide == telephoto) and the listed zoom count as inner zoom
        is_inner_zoom.append(bool(
            row['name'] in ['Leica APO-Vario-Elmarit-SL 90-280 f/2.8-4']
            or row['wide_focal_length'] == row['telephoto_focal_length']))
    df['is_drip_proof'] = is_drip_proof
    df['has_image_stabilization'] = has_image_stabilization
    df['is_inner_zoom'] = is_inner_zoom
    del df['O.I.S. Performance as per CIPA']

    # overall_diameter, overall_length
    overall_diameter = []
    overall_length = []
    for _, row in df.iterrows():
        overall_diameter.append(_parse_leica_dimension(row['Largest diameter']))
        overall_length.append(_parse_leica_dimension(row['Length to bayonet mount']))
    df['overall_diameter'] = overall_diameter
    df['overall_length'] = overall_length
    del df['Largest diameter']
    del df['Length to bayonet mount']

    # weight
    weight: List[float] = []
    for raw_weight in df['Weight'].values:
        f = raw_weight.replace('\u2009', ' ')
        result = regex(f, r'([\d.]+) g')
        if len(result) > 0:
            # For "A/B g" the first value is used; dots are dropped before
            # the integer conversion.
            result2 = regex(f, r'([\d.]+)/[\d.]+ g')
            if len(result2) > 0:
                weight.append(int(result2[0].replace('.', '')))
            else:
                weight.append(int(result[0].replace('.', '')))
        else:
            # No " g" suffix: the whole text must be a plain integer
            weight.append(int(f))
    df['weight'] = weight
    del df['Weight']

    df['price'] = 0
    return df

Esempio n. 8

Mostra file

File: cosina.py Progetto: YSRKEN/MFT-DB-Tool

def get_cosina_lens_list(scraping: IScrapingService) -> DataFrame:
    """Scrape Cosina's Micro Four Thirds lens pages into a normalized DataFrame.

    The index page is scanned for image links to lens pages; the image's
    ``alt`` text is translated through ``lens_name_table`` into the lens
    name.  Every lens page is then read (table rows plus the ``<h2>`` that
    carries the price) and converted into the common output columns.

    Args:
        scraping: Service used to download and parse the pages (the site is
            read with the ``cp932`` encoding).

    Returns:
        A DataFrame with one row per lens.

    Raises:
        Exception: If the index page lists a lens whose name is not a key of
            ``lens_name_table``.
        KeyError: If an expected Japanese column is missing.
    """
    # Collect the (name, URL) pairs of the lenses
    lens_list: List[Tuple[str, str]] = []
    page = scraping.get_page(
        'http://www.cosina.co.jp/seihin/voigtlander/mft-mount/index.html',
        encoding='cp932',
        cache=False)
    for a_element in page.find_all('td > a'):
        lens_name = a_element.find('img').attrs['alt']
        lens_url = 'http://www.cosina.co.jp/seihin/voigtlander/mft-mount/' + a_element.attrs[
            'href']
        if 'mm' in lens_name and 'mft' in a_element.attrs['href']:
            if lens_name not in lens_name_table:
                # Show the unknown name before aborting
                print(lens_name)
                raise Exception('未対応のレンズが含まれています')
            lens_list.append((lens_name_table[lens_name], lens_url))

    # Collect the raw specification of every lens
    temp_list: List[Dict[str, any]] = []
    for lens_name, lens_url in lens_list:
        page = scraping.get_page(lens_url, encoding='cp932')
        temp: Dict[str, str] = {'レンズ名': lens_name, 'URL': lens_url}
        for tr_element in page.find_all('tr'):
            td_elements = tr_element.find_all('td')
            if len(td_elements) < 2:
                continue
            # Only rows whose first cell has a bgcolor attribute are read
            if 'bgcolor' not in td_elements[0].attrs:
                continue
            if td_elements[0].full_text == '' or td_elements[1].full_text == '':
                continue
            temp[td_elements[0].full_text] = td_elements[1].full_text
        # The price is taken from a heading rather than from the table
        for h2_element in page.find_all('h2'):
            text = h2_element.text
            if '希望小売価格' in text:
                temp['希望小売価格'] = text.replace('\n', '')
        temp_list.append(temp)
    df = DataFrame.from_records(temp_list)

    # Reshape into the common output columns
    df['maker'] = 'COSINA'

    df['name'] = df['レンズ名']
    del df['レンズ名']

    df['product_number'] = ''

    # Focal lengths are doubled
    # NOTE(review): presumably the 35mm-equivalent conversion - confirm
    w, t = extract_numbers(df['焦点距離'], [], [r'(\d+\.?\d*)mm'])
    df['wide_focal_length'] = [int(Decimal(x) * 2) for x in w]
    # The telephoto column is built from the telephoto list `t`, like every
    # other wide/telephoto pair in this function.
    df['telephoto_focal_length'] = [int(Decimal(x) * 2) for x in t]
    del df['焦点距離']
    del df['画角']
    del df['レンズ構成']

    # The f-number is parsed from the lens name
    w, t = extract_numbers(df['name'], [], [r'F(\d+\.?\d*)'])
    df['wide_f_number'] = [float(x) for x in w]
    df['telephoto_f_number'] = [float(x) for x in t]
    del df['口径比']
    del df['最小絞り']
    del df['絞り羽根枚数']

    # The captured values are followed by "m" in the source text; scaleb(3)
    # multiplies them by 1000.
    w, t = extract_numbers(df['最短撮影距離'], [], [r'(\d+\.?\d+)m'])
    df['wide_min_focus_distance'] = [int(Decimal(x).scaleb(3)) for x in w]
    df['telephoto_min_focus_distance'] = [int(Decimal(x).scaleb(3)) for x in t]
    del df['最短撮影距離']

    # "1:N" (ASCII or full-width colon) becomes 1/N rounded to 0.01.  Index 1
    # is the second capture group, i.e. N; index 0 is the colon itself.
    m: List[str] = []
    for record in df['最大撮影倍率']:
        m.append(regex(record, r'1(:|：)(\d+\.?\d*)')[1])
    df['max_photographing_magnification'] = [
        float(str((Decimal(1.0) / Decimal(x)).quantize(Decimal('0.01'))))
        for x in m
    ]
    del df['最大撮影倍率']

    filter_diameter: List[int] = []
    for record in df['フィルターサイズ']:
        filter_diameter.append(int(regex(record, r'(\d+)mm')[0]))
    df['filter_diameter'] = filter_diameter
    del df['フィルターサイズ']

    df['is_drip_proof'] = False
    df['has_image_stabilization'] = False
    df['is_inner_zoom'] = True

    # NOTE(review): diameter, length and weight are stored exactly as
    # extract_numbers returns them, without a float()/int() conversion -
    # confirm that downstream code expects that.
    di, le = extract_numbers(df['最大径×全長'], [r'φ(\d+\.?\d*)×(\d+\.?\d*)mm'], [])
    df['overall_diameter'] = di
    df['overall_length'] = le
    del df['最大径×全長']

    weight, _ = extract_numbers(df['重量'], [], [r'(\d+)g'])
    df['weight'] = weight
    del df['重量']

    price, _ = extract_numbers(df['希望小売価格'], [], [r'￥([\d,]+)'])
    df['price'] = [int(x.replace(',', '')) for x in price]
    del df['希望小売価格']

    df['mount'] = 'マイクロフォーサーズ'
    df['url'] = df['URL']
    del df['レンズフード']
    del df['その他：']
    del df['URL']

    return df