Ejemplo n.º 1
0
class MouserGo:
    def __init__(self):
        """Create the proxy pool, pick a first proxy and open one HTTP session."""
        pool = ProxyPool()
        self.proxy_pool = pool
        # Value handed to ``session.proxies.update`` before each request.
        self.proxy_ip = pool.get()
        # Prefix prepended to site-relative links found in the pages.
        self.mouser_host_url = "http://www.mouser.cn"
        # One session object is reused by every method of this class.
        self.my_session = requests.session()

    def get_all_category(self):
        """Collect the category tuples of every sub-category on the index page.

        The Electronic-Components index page is requested through the current
        proxy until it answers 200 and contains at least one
        ``a.SearchResultsSubLevelCategory`` link.  Every failed attempt
        (non-200 status, no links found, or any exception) removes the current
        proxy from the pool and switches to another one.  Each link found is
        then expanded with :meth:`get_detail_category`.

        Returns:
            list: the lists returned by ``get_detail_category`` for each link,
            concatenated in page order.
        """
        # Built once: the headers do not change between attempts.
        my_headers = {'Connection': 'Keep-Alive',
                      'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
                      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                      'Accept-Encoding': 'gzip, deflate, sdch',
                      "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36',
                      "Host": "www.mouser.cn", "Upgrade-Insecure-Requests": "1",
                      "Referer": "http://www.mouser.cn/Electronic-Components/", }
        while True:
            try:
                self.my_session.proxies.update(self.proxy_ip)
                self.my_session.headers.update(my_headers)
                # Same 20 s timeout as the other requests of this class; without
                # one a stalled proxy would block this call indefinitely.
                res = self.my_session.get("http://www.mouser.cn/Electronic-Components/", timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    self.proxy_pool.remove(self.proxy_ip)
                    self.proxy_ip = self.proxy_pool.get()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")
                category_url_tags = bs_content.find_all(name="a", attrs={"class": "SearchResultsSubLevelCategory"})
                if not category_url_tags:
                    print(sys._getframe().f_code.co_name, "category_url_tag is None")
                    # Switch proxy before retrying, as get_detail_category does
                    # when its page lacks the expected markup; retrying through
                    # the same proxy would just repeat the same request.
                    self.proxy_pool.remove(self.proxy_ip)
                    self.proxy_ip = self.proxy_pool.get()
                    continue
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()

        multi_category_structures = []
        for category_url_tag in category_url_tags:
            # The first two characters of the href are dropped before the host
            # is prepended (presumably a leading ".." -- confirm against a page).
            url = self.mouser_host_url + category_url_tag.get("href")[2:]
            multi_category_structures += self.get_detail_category(url)
        return multi_category_structures

    def get_detail_category(self, url):
        """Fetch one category page and describe its leaf categories.

        The page is requested through the current proxy until it answers 200
        and contains the first breadcrumb link; each failed attempt removes the
        current proxy from the pool and switches to another one.

        Category names are read from the breadcrumb links ``ctl01`` to
        ``ctl03``.  A missing second or third level is filled with the name of
        the level above it, so pages with fewer breadcrumbs still yield
        complete tuples.

        Args:
            url: absolute URL of the category page.

        Returns:
            list[tuple]: ``(first, second, third, fourth, fourth_url,
            component_count)`` tuples of strings.  One tuple per
            ``div.div-cat-title`` block when the page lists sub-categories,
            otherwise a single tuple describing the page itself.
        """
        # Built once: the headers do not change between attempts.
        detail_headers = {'Connection': 'Keep-Alive',
                          'Accept-Language': 'zh-CN,zh;q=0.8',
                          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                          'Accept-Encoding': 'gzip, deflate, sdch',
                          "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36',
                          "Host": "www.mouser.cn", "Upgrade-Insecure-Requests": "1",
                          "Referer": "http://www.mouser.cn/Electronic-Components/", }
        while True:
            try:
                self.my_session.proxies.update(self.proxy_ip)
                self.my_session.headers.update(detail_headers)
                res = self.my_session.get(url, timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    self.proxy_pool.remove(self.proxy_ip)
                    self.proxy_ip = self.proxy_pool.get()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")

                first_category_tag = bs_content.find(name="a",
                                                     id="ctl00_ContentMain_bc_rptrBreadcrumbs_ctl01_lnkBreadcrumb")
                if not first_category_tag:
                    # Without the first breadcrumb the page is treated as a
                    # failed fetch and retried through another proxy.
                    self.proxy_pool.remove(self.proxy_ip)
                    print("None, go on")
                    self.proxy_ip = self.proxy_pool.get()
                    continue
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()

        first_category_name = first_category_tag.text
        second_category_tag = bs_content.find(name="a", id="ctl00_ContentMain_bc_rptrBreadcrumbs_ctl02_lnkBreadcrumb")
        if second_category_tag:
            second_category_name = second_category_tag.text
        else:
            # Same fill-down rule already applied to the third level below;
            # previously a missing second breadcrumb raised AttributeError here.
            second_category_name = first_category_name
        third_category_tag = bs_content.find(name="a", id="ctl00_ContentMain_bc_rptrBreadcrumbs_ctl03_lnkBreadcrumb")
        if third_category_tag:
            third_category_name = third_category_tag.text
        else:
            third_category_name = second_category_name

        detail_category_tags = bs_content.find_all(name="div", attrs={"class": "div-cat-title"})
        category_structures = []
        if detail_category_tags:
            # Everything before "/_/" in the page URL is the prefix for the
            # sub-category links.
            # NOTE(review): re.match returns None for a URL without "/_/.../",
            # which still raises AttributeError here -- confirm such URLs
            # cannot reach this branch.
            pre_category_url = re.match(r"(.+)/_/.+/$", url).group(1)
            for detail_category_tag in detail_category_tags:
                forth_category_tag = detail_category_tag.a
                forth_category_name = forth_category_tag.text
                # The first five characters of the href are dropped before the
                # prefix is prepended (presumably a relative "../.." part --
                # confirm against a page).
                forth_category_url = pre_category_url + forth_category_tag.get("href")[5:]
                component_count = detail_category_tag.span.span.text.replace(",", "")
                category_structures.append((
                    first_category_name, second_category_name, third_category_name, forth_category_name,
                    forth_category_url,
                    component_count))
        else:
            # No sub-category blocks: the page itself is the leaf category.
            forth_category_name = third_category_name
            forth_category_url = url
            # NOTE(review): a page without this span still raises
            # AttributeError on the next line -- confirm it is always present.
            component_count_tag = bs_content.find(name="span", id="ctl00_ContentMain_lblProductCount")
            component_count = component_count_tag.text.replace("(", "").replace(")", "").replace(",", "")
            category_structures.append((
                first_category_name, second_category_name, third_category_name, forth_category_name,
                forth_category_url,
                component_count))
        print(category_structures)
        return category_structures

    def category_to_csv(self, category_structure):
        with open("..\\Mouser.csv", "w", encoding="utf-8") as f:
            for category_structure in category_structure:
                modify_category_structure = []
                for category_name in category_structure:
                    modify_category_name = category_name.replace(",", ",")
                    modify_category_structure.append(modify_category_name)
                line = (",".join(modify_category_structure)) + "\n"
                f.write(line.encode().decode())

    def read_from_csv(self):
        csv_categories = []
        with open("..\\Mouser.csv", "r", encoding="utf-8") as f:
            read = csv.reader(f)
            for line in read:
                print(line)
                csv_categories.append(line)
        return csv_categories



    def thread_go(self, category_tree):
        first_category_name = category_tree[0].replace("\ufeff", "")
        second_category_name = str(category_tree[1:-2])
        url, component_count = category_tree[-2:]
        if component_count == 1:
            return
        my_headers = {'Connection': 'Keep-Alive',
                      'Accept-Language': 'zh-CN,zh;q=0.8',
                      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                      'Accept-Encoding': 'gzip, deflate, sdch',
                      "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36',
                      "Host": "www.mouser.cn", "Upgrade-Insecure-Requests": "1", }
        for page_num in range(0, int(component_count), 25):
            page_url = url + "?No=" + str(page_num)
            count = 0
            while True:
                try:
                    self.my_session.headers.update(my_headers)
                    self.my_session.proxies.update(self.proxy_ip)
                    res = self.my_session.get(page_url, timeout=20)
                    if res.status_code != 200:
                        print(res.status_code)
                        self.proxy_pool.remove(self.proxy_ip)
                        self.proxy_ip = self.proxy_pool.get()
                        continue
                    bs_content = BeautifulSoup(res.content, "lxml")
                    component_tags = bs_content.find(name="table", attrs={"class": "SearchResultsTable"}).find_all(
                        name="tr", attrs={"class": re.compile(r"SearchResult")})

                    break
                except Exception as e:
                    count += 1
                    print(sys._getframe().f_code.co_name, e)
                    self.proxy_ip = self.proxy_pool.get()
                    if count > 20:
                        self.proxy_pool._refresh()



            table_header_tags = component_tags[0].find_all(name="th")[11:]

            for component_tag in component_tags[2:]:
                td_tags = component_tag.find_all(name="td")
                try:
                    rough_component_code = td_tags[3].text.strip()
                    no = len(rough_component_code)
                    for num, code_str in enumerate(rough_component_code):
                        if code_str == "\n":
                            no = num
                            break

                    component_code = rough_component_code[:no]
                except Exception as e:
                    print("component code is None", e)
                    continue
                try:
                    component_img = self.mouser_host_url + td_tags[1].find(name="img").get("src").replace("/sm/",
                                                                                                          "/images/")
                except:
                    component_img = ""
                try:
                    rough_attach = td_tags[6].find(name="a", text=re.compile(r".*数据表"))
                    component_attach = rough_attach.get("href")
                    if "http" not in component_attach:
                        component_attach = ""
                except Exception as e:
                    print("pdf is none", page_url, component_code)
                    component_attach = ""
                    # if not component_img:
                    #     continue
                try:
                    component_brand = td_tags[4].a.text
                except Exception as e:
                    print(sys._getframe().f_code.co_name, e)
                    continue

                component = (
                    component_code, component_brand, first_category_name, second_category_name, page_url,
                    component_attach,
                    component_img)
                count = 0
                try:
                    rohs_tag = td_tags[10]
                except Exception as e:
                    print(e)
                    continue

                property_key_values = []
                if rohs_tag.text == "详细信息":
                    key_value = ("RoHS", "Yes")
                    property_key_values.append(key_value)

                len_heads = len(table_header_tags)
                if len_heads:
                    for name_tag, property_tag in zip(table_header_tags, td_tags[-len_heads:]):

                        property_name = name_tag.text.strip()

                        property_value = property_tag.text.strip()
                        key_value = (property_name, property_value)
                        property_key_values.append(key_value)

                while True:
                    try:
                        orcl_conn = OracleSave(1000002)
                        orcl_conn.component_insert(component)
                        for key_value in property_key_values:
                            orcl_conn.properties_insert(key_value)
                        orcl_conn.commit()
                        orcl_conn.conn.close()

                        break
                    except Exception as e:
                        print(e)
                        count += 1
                        # if count > 3:
                        #     break

    def get_page_url(self, category_trees):
        pages_category = []
        for category_tree in category_trees:
            first_category_name = category_tree[0].replace("\ufeff", "")
            second_category_name = str(category_tree[1:-2])
            url, component_count = category_tree[-2:]
            if component_count == 1:
                continue
            for page_num in range(0, int(component_count), 25):
                page_url = url + "?No=" + str(page_num)
                page_category = (first_category_name, second_category_name, page_url)
                pages_category.append(page_category)
        return pages_category

    def page_thread_go(self, page_category):
        """Scrape one search-result page and store every component row on it.

        Args:
            page_category: ``(first_category_name, second_category_name,
                page_url)`` as produced by ``get_page_url``.

        The page is requested until it answers 200 and the rows of
        ``table.SearchResultsTable`` can be extracted.  A non-200 answer
        removes the current proxy from the pool; any exception switches to
        another proxy, and after more than 20 exceptions the pool is refreshed
        on each further failure.  There is no upper bound on attempts.

        For each data row the component code, brand, image URL, datasheet URL
        and the extra property columns are read, then written through a fresh
        ``OracleSave(1000002)``.  Rows lacking a code, brand or RoHS cell are
        skipped.  A failing save is retried until it succeeds.
        """
        first_category_name, second_category_name, page_url = page_category
        my_headers = {'Connection': 'Keep-Alive',
                      'Accept-Language': 'zh-CN,zh;q=0.8',
                      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                      'Accept-Encoding': 'gzip, deflate, sdch',
                      "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36',
                      "Host": "www.mouser.cn", "Upgrade-Insecure-Requests": "1", }
        count = 0
        while True:
            try:
                self.my_session.headers.update(my_headers)
                self.my_session.proxies.update(self.proxy_ip)
                res = self.my_session.get(page_url, timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    self.proxy_pool.remove(self.proxy_ip)
                    self.proxy_ip = self.proxy_pool.get()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")
                # find() returns None when the table is absent; the resulting
                # AttributeError lands in the handler below and triggers a retry.
                component_tags = bs_content.find(name="table", attrs={"class": "SearchResultsTable"}).find_all(
                    name="tr", attrs={"class": re.compile(r"SearchResult")})
                break
            except Exception as e:
                count += 1
                print(sys._getframe().f_code.co_name, e)
                self.proxy_ip = self.proxy_pool.get()
                if count > 20:
                    self.proxy_pool._refresh()

        if not component_tags:
            # A table without matching rows has nothing to store; indexing
            # row 0 below would otherwise raise IndexError.
            return

        # Header cells from index 11 on name the per-component property columns.
        table_header_tags = component_tags[0].find_all(name="th")[11:]
        len_heads = len(table_header_tags)
        datasheet_link_text = re.compile(r".*数据表")

        # The first two matching rows are not processed as components
        # (presumably header rows -- confirm against a page).
        for component_tag in component_tags[2:]:
            td_tags = component_tag.find_all(name="td")
            try:
                # Keep only the first line of the cell text.
                component_code = td_tags[3].text.strip().split("\n", 1)[0]
            except Exception as e:
                print("component code is None", e)
                continue
            try:
                component_img = self.mouser_host_url + td_tags[1].find(name="img").get("src").replace("/sm/",
                                                                                                      "/images/")
            except Exception:
                # No image is not an error; the component is stored without one.
                component_img = ""
            try:
                rough_attach = td_tags[6].find(name="a", text=datasheet_link_text)
                component_attach = rough_attach.get("href")
                if "http" not in component_attach:
                    component_attach = ""
            except Exception:
                print("pdf is none", page_url, component_code)
                component_attach = ""
            try:
                component_brand = td_tags[4].a.text
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                continue

            component = (
                component_code, component_brand, first_category_name, second_category_name, page_url,
                component_attach,
                component_img)
            try:
                rohs_tag = td_tags[10]
            except Exception as e:
                print(e)
                continue

            property_key_values = []
            if rohs_tag.text == "详细信息":
                property_key_values.append(("RoHS", "Yes"))
            if len_heads:
                # The last len_heads cells of the row line up with the
                # property header cells.
                for name_tag, property_tag in zip(table_header_tags, td_tags[-len_heads:]):
                    property_key_values.append((name_tag.text.strip(), property_tag.text.strip()))

            # Retried until it succeeds (the original bound was commented out).
            while True:
                orcl_conn = None
                try:
                    orcl_conn = OracleSave(1000002)
                    orcl_conn.component_insert(component)
                    for key_value in property_key_values:
                        orcl_conn.properties_insert(key_value)
                    orcl_conn.commit()
                    break
                except Exception as e:
                    print("database save exception", e)
                finally:
                    # Close on failure too, so each retry does not leave an
                    # open connection behind.  A failing close is reported but
                    # does not repeat an insert that was already committed.
                    if orcl_conn is not None:
                        try:
                            orcl_conn.conn.close()
                        except Exception as e:
                            print("database close exception", e)
Ejemplo n.º 2
0
class FPNewark:
    def __init__(self):
        """Open one HTTP session, create the proxy pool and pick a first proxy."""
        # One session object is reused by every method of this class.
        self.my_session = requests.session()
        pool = ProxyPool()
        self.proxy_pool = pool
        # Value handed to ``session.proxies.update`` before each request.
        self.proxy_ip = pool.get()

    def get_category_trees(self, category_trees):
        multi_category_trees = []
        for category_tree in category_trees:
            url = category_tree[-2]
            count = 0
            while True:
                try:
                    self.my_session.proxies.update(self.proxy_ip)
                    res = self.my_session.get(url, timeout=20)
                    if res.status_code != 200:
                        print(res.status_code)
                        continue
                    bs_content = BeautifulSoup(res.content, "lxml")

                    break
                except Exception as e:
                    count += 1
                    print(sys._getframe().f_code.co_name, url, e)
                    self.proxy_ip = self.proxy_pool.get()
                    if count > 100:
                        self.proxy_pool._refresh()

            category_list = bs_content.find(name="ul",
                                            attrs={"class": "categoryList"})
            if not category_list:
                print(category_tree)
                multi_category_trees.append(category_tree)
                continue
            else:

                child_category_tags = category_list.find_all(name="a")

                category_trees = []
                for child_category_tag in child_category_tags:
                    child_category_url = child_category_tag.get("href")
                    rough_child_category_tag = child_category_tag.text.strip()
                    flag = re.match(r"(.*?) \((\d+.*?)\)",
                                    rough_child_category_tag)
                    child_category_name = flag.group(1)

                    component_count = flag.group(2).replace(",", "")
                    if component_count == '1':
                        continue

                    child_category = [
                        child_category_name, child_category_url,
                        component_count
                    ]
                    child_category_tree = list(
                        category_tree)[:-2] + child_category
                    category_trees.append(child_category_tree)
                child_categories = self.get_category_trees(category_trees)
                print(child_categories)
                multi_category_trees += child_categories
            print("Current Count: ", len(multi_category_trees))
        return multi_category_trees

    def get_first_categories(self):
        """Fetch the browse page and list every second-level category on it.

        Returns:
            list[tuple]: ``(first_category_name, second_category_name,
            second_category_url, component_count)`` for every ``li`` under
            each ``ul.categoryList`` block.  Entries with a count of ``'1'``
            are left out.

        The page is requested until it answers 200.  A non-200 answer now
        removes the current proxy from the pool and switches to another one,
        as ``thread_go`` does; exceptions switch proxy as before.
        """
        # Work on a copy: assigning keys on Default_Header itself would change
        # the shared module-level mapping for every other user of it.
        my_headers = dict(Default_Header)
        my_headers["host"] = "www.newark.com"
        my_headers["Referer"] = "http://www.newark.com/"
        my_headers["Upgrade-Insecure-Requests"] = "1"
        while True:
            try:
                self.my_session.headers.update(my_headers)
                self.my_session.proxies.update(self.proxy_ip)

                res = self.my_session.get(
                    "http://www.newark.com/browse-for-products", timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    self.proxy_pool.remove(self.proxy_ip)
                    self.proxy_ip = self.proxy_pool.get()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")
                # Explicit class filter, written the same way as in
                # get_category_trees (the original passed a set literal here).
                first_category_tags = bs_content.find_all(
                    name="ul", attrs={"class": "categoryList"})
                break
            except Exception as e:
                print("Part1", sys._getframe().f_code.co_name, e)
                self.proxy_ip = self.proxy_pool.get()

        second_pages = []
        for first_category_tag in first_category_tags:
            first_category_name = first_category_tag.li.h2.text.strip()
            second_category_tags = first_category_tag.li.ul.find_all(name="li")
            for second_category_tag in second_category_tags:
                second_category_url = second_category_tag.a.get("href")
                rough_second_category_name = second_category_tag.text.strip()
                # Item text is expected to look like "Name (1,234)".
                flag = re.match(r"(.*?) \((\d+.*?)\)",
                                rough_second_category_name)
                if flag is None:
                    # Skip an item without a "(count)" suffix instead of
                    # crashing on flag.group().
                    print(sys._getframe().f_code.co_name,
                          "unparsable category label", rough_second_category_name)
                    continue
                second_category_name = flag.group(1)
                component_count = flag.group(2).replace(",", "")
                if component_count == '1':
                    continue
                second_pages.append((first_category_name, second_category_name,
                                     second_category_url, component_count))
        return second_pages

    def csv_write(self, category_structures):
        with open("..\\Newark_test.csv", "w", encoding="utf-8") as f:
            for category_structure in category_structures:
                modify_category_structure = []
                for structure_name in category_structure:
                    modify_structure_name = structure_name.replace(",", ",")
                    modify_category_structure.append(modify_structure_name)
                line = (",".join(modify_category_structure)) + "\n"
                f.write(line.encode().decode())

    #
    def thread_go(self, category_tree, start_page=877):
        """Scrape and store the result pages of one category.

        Args:
            category_tree: sequence whose first item is the top category name,
                whose last two items are the category URL and its component
                count, and whose items in between are the remaining category
                names.
            start_page: first page number to fetch.  The default is the value
                that used to be hard-coded in the loop, so existing callers
                behave as before; pass ``1`` to start from the first page.

        Pages ``start_page`` .. ``int(count / 25) + 1`` are fetched from
        ``url + "/prl/results/" + page``.  A page is abandoned (treated as
        having no rows) after more than 10 exceptions.  Each row of
        ``table#sProdList`` is stored through a fresh ``OracleSave(1000003)``
        together with the name/value pairs of its ``table.TFtable``; a failing
        save is retried until it succeeds.
        """
        # Work on a copy: assigning keys on Default_Header itself would change
        # the shared module-level mapping for every other user of it.
        my_headers = dict(Default_Header)
        my_headers["host"] = "www.newark.com"
        my_headers["Referer"] = "http://www.newark.com/"
        my_headers["Upgrade-Insecure-Requests"] = "1"
        first_category_name = category_tree[0]
        second_category_name = str(category_tree[1:-2])
        url, component_count = category_tree[-2:]
        page_count = int(int(component_count) / 25) + 1
        for page_num in range(start_page, page_count + 1):
            page_url = url + "/prl/results/" + str(page_num)
            count = 0
            while True:
                try:
                    self.my_session.headers.update(my_headers)
                    self.my_session.proxies.update(self.proxy_ip)
                    res = self.my_session.get(page_url, timeout=20)
                    if res.status_code != 200:
                        print(res.status_code)
                        self.proxy_pool.remove(self.proxy_ip)
                        self.proxy_ip = self.proxy_pool.get()
                        continue
                    bs_content = BeautifulSoup(res.content, "lxml")
                    component_tags = bs_content.find(
                        name="table", id="sProdList").tbody.find_all(name="tr")
                    break
                except Exception as e:
                    count += 1
                    print(sys._getframe().f_code.co_name, e)
                    self.proxy_ip = self.proxy_pool.get()
                    if count > 10:
                        # Give up on this page.  (A former "count > 100" pool
                        # refresh placed after this break could never run and
                        # has been dropped.)
                        print(category_tree, page_url)
                        component_tags = []
                        break

            for component_tag in component_tags:
                detail_table = component_tag.find(name="table",
                                                  attrs={"class": "TFtable"})
                td_tags = component_tag.find_all(name="td")
                try:
                    component_code = td_tags[1].text.strip()
                except Exception as e:
                    print("component code is None", e)
                    continue
                try:
                    component_img = td_tags[1].find(
                        name="img", attrs={"class": "productThumbnail"}).get("src")
                except Exception:
                    # No image is not an error; the component is stored without one.
                    component_img = ""
                try:
                    rough_attach = td_tags[2].find(name="a", text="数据表")
                    if not rough_attach:
                        rough_attach = td_tags[2].find(
                            name="a", attrs={"class": "prodDetailsAttachment"})
                    component_attach = rough_attach.get("href")
                    if "http" not in component_attach:
                        component_attach = ""
                except Exception:
                    component_attach = ""
                try:
                    manufacture_description = td_tags[3].a.find_all(name="p")
                    component_brand = manufacture_description[0].text.strip()
                    # The description is not stored, but the lookup is kept:
                    # a row without a second <p> is skipped, as before.
                    component_description = manufacture_description[
                        1].text.strip()
                except Exception as e:
                    print(sys._getframe().f_code.co_name, e)
                    continue
                if not component_img and not component_attach and not component_brand:
                    continue

                component = (component_code, component_brand,
                             first_category_name, second_category_name,
                             page_url, component_attach, component_img)

                # Properties are parsed before the save loop.  Parsing used to
                # happen inside it, so a property row with fewer than two
                # cells raised on every attempt and the retry never ended.
                property_key_values = []
                if detail_table:
                    for property_tag in detail_table.find_all(name="tr"):
                        detail_td_tags = property_tag.find_all("td")
                        if len(detail_td_tags) < 2:
                            continue
                        property_key_values.append(
                            (detail_td_tags[0].text.strip(),
                             detail_td_tags[1].text.strip()))

                # Retried until it succeeds (the original bound was commented out).
                while True:
                    orcl_conn = None
                    try:
                        orcl_conn = OracleSave(1000003)
                        orcl_conn.component_insert(component)
                        for key_value in property_key_values:
                            orcl_conn.properties_insert(key_value)
                        orcl_conn.commit()
                        break
                    except Exception as e:
                        print(e)
                    finally:
                        # Close on failure too, so each retry does not leave
                        # an open connection behind.
                        if orcl_conn is not None:
                            try:
                                orcl_conn.conn.close()
                            except Exception as e:
                                print(e)

    def extra_go(self, category_tree, page_range=None):
        """Scrape and store a range of result pages using a thread pool.

        Args:
            category_tree: sequence whose first item is the top category name,
                whose second-to-last item is the category URL, and whose items
                in between are the remaining category names.  The last item
                (component count) is not used here.
            page_range: iterable of page numbers to fetch.  Defaults to
                ``range(875, 17557)``, the range that used to be hard-coded.

        Every page number is handed to ``ThreadingPool(8).multi_process``.
        Each page is fetched from ``url + "/prl/results/" + page`` and
        abandoned (treated as having no rows) after more than 10 exceptions.
        Each row of ``table#sProdList`` is stored through a fresh
        ``OracleSave(1000003)`` together with the name/value pairs of its
        ``table.TFtable``; a failing save is retried until it succeeds.
        """
        # Work on a copy: assigning keys on Default_Header itself would change
        # the shared module-level mapping for every other user of it.
        my_headers = dict(Default_Header)
        my_headers["host"] = "www.newark.com"
        my_headers["Referer"] = "http://www.newark.com/"
        my_headers["Upgrade-Insecure-Requests"] = "1"
        first_category_name = category_tree[0]
        second_category_name = str(category_tree[1:-2])
        url = category_tree[-2]
        if page_range is None:
            page_range = range(875, 17557)

        def extra_thread(page_num):
            """Fetch, parse and store a single result page."""
            page_url = url + "/prl/results/" + str(page_num)
            count = 0
            while True:
                try:
                    self.my_session.headers.update(my_headers)
                    self.my_session.proxies.update(self.proxy_ip)
                    res = self.my_session.get(page_url, timeout=20)
                    if res.status_code != 200:
                        print(res.status_code)
                        self.proxy_pool.remove(self.proxy_ip)
                        self.proxy_ip = self.proxy_pool.get()
                        continue
                    bs_content = BeautifulSoup(res.content, "lxml")
                    component_tags = bs_content.find(
                        name="table", id="sProdList").tbody.find_all(name="tr")
                    break
                except Exception as e:
                    count += 1
                    print(sys._getframe().f_code.co_name, e)
                    self.proxy_ip = self.proxy_pool.get()
                    if count > 10:
                        # Give up on this page.  (A former "count > 100" pool
                        # refresh placed after this break could never run and
                        # has been dropped.)
                        print(category_tree, page_url)
                        component_tags = []
                        break

            for component_tag in component_tags:
                detail_table = component_tag.find(name="table",
                                                  attrs={"class": "TFtable"})
                td_tags = component_tag.find_all(name="td")
                try:
                    component_code = td_tags[1].text.strip()
                except Exception as e:
                    print("component code is None", e)
                    continue
                try:
                    component_img = td_tags[1].find(
                        name="img", attrs={"class": "productThumbnail"}).get("src")
                except Exception:
                    # No image is not an error; the component is stored without one.
                    component_img = ""
                try:
                    rough_attach = td_tags[2].find(name="a", text="数据表")
                    if not rough_attach:
                        rough_attach = td_tags[2].find(
                            name="a", attrs={"class": "prodDetailsAttachment"})
                    component_attach = rough_attach.get("href")
                    if "http" not in component_attach:
                        component_attach = ""
                except Exception:
                    component_attach = ""
                try:
                    manufacture_description = td_tags[3].a.find_all(name="p")
                    component_brand = manufacture_description[0].text.strip()
                    # The description is not stored, but the lookup is kept:
                    # a row without a second <p> is skipped, as before.
                    component_description = manufacture_description[
                        1].text.strip()
                except Exception as e:
                    print(sys._getframe().f_code.co_name, e)
                    continue
                if not component_img and not component_attach and not component_brand:
                    continue

                component = (component_code, component_brand,
                             first_category_name, second_category_name,
                             page_url, component_attach, component_img)

                # Properties are parsed before the save loop.  Parsing used to
                # happen inside it, so a property row with fewer than two
                # cells raised on every attempt and the retry never ended.
                property_key_values = []
                if detail_table:
                    for property_tag in detail_table.find_all(name="tr"):
                        detail_td_tags = property_tag.find_all("td")
                        if len(detail_td_tags) < 2:
                            continue
                        property_key_values.append(
                            (detail_td_tags[0].text.strip(),
                             detail_td_tags[1].text.strip()))

                # Retried until it succeeds (the original bound was commented out).
                while True:
                    orcl_conn = None
                    try:
                        orcl_conn = OracleSave(1000003)
                        orcl_conn.component_insert(component)
                        for key_value in property_key_values:
                            orcl_conn.properties_insert(key_value)
                        orcl_conn.commit()
                        break
                    except Exception as e:
                        print(e)
                    finally:
                        # Close on failure too, so each retry does not leave
                        # an open connection behind.
                        if orcl_conn is not None:
                            try:
                                orcl_conn.conn.close()
                            except Exception as e:
                                print(e)

        extra_threading = ThreadingPool(8)
        extra_threading.multi_process(extra_thread, page_range)

    def read_from_csv(self):
        csv_categories = []
        with open("..\\Newark_test.csv", "r", encoding="utf-8") as f:
            read = csv.reader(f)
            for line in read:
                print(line)
                csv_categories.append(line)
        return csv_categories