Example 1
class FileSystem:
    def __init__(self):
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()

    def file_download(self, url, file_type, file_name=None):
        # Evaluate the random default per call; a default of str(random.random())
        # in the signature would be computed only once, at definition time.
        file_name = file_name or str(random.random())
        download_dir_path = "..\\download_files\\"
        if not os.path.exists(download_dir_path):
            os.mkdir(download_dir_path)
        download_file_path = download_dir_path + file_name + file_type
        if os.path.exists(download_file_path):
            return
        try_count = 0
        while True:
            try:
                download_file_path = download_dir_path + str(
                    random.random()) + file_type
                # html_analyse = HtmlAnalyse(url, proxy=self.proxy_ip)
                html_analyse = HtmlAnalyse(url)
                html_analyse.download(download_file_path)
                print("File Download Success !!")
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, url, e)
                try_count += 1
                if try_count > 2 and "https" in url:
                    return
                if try_count > 5:
                    return
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
                # download_file_path = download_dir_path + str(random.random()) + file_type

        return download_file_path

    def file_upload(self, local_file_path):
        if not local_file_path:
            return
        while True:
            try:
                with open(local_file_path, "rb") as f:
                    res = requests.post(File_Server_Url, files={'file': f})
                if res.status_code == 200:
                    res_j = res.json()
                    break
                print("upload failed, status", res.status_code)
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
        server_file_path = res_j["path"]
        print("File Server Upload Success !!")
        return server_file_path

    def download_upload(self, url, file_type):
        download_file_path = self.file_download(url, file_type)
        server_file_path = self.file_upload(download_file_path)
        return server_file_path
Example 2
class Soudh:
    def __init__(self):
        self.proxy_pool = ProxyPool(flag=False)
        self.proxy_ip = self.proxy_pool.get()
        conn = MongoClient("10.10.101.22", 27017)
        self.db = conn.spider

    def get_suppliers(self):
        def thread_go(page_url):
            html_analyse = HtmlAnalyse(page_url)
            while True:
                try:
                    bs_content = html_analyse.get_bs_contents()
                    break
                except Exception as e:
                    print(e)
            ul_tag = bs_content.find(name="div",
                                     attrs={"class": "leftbox comlist"})
            li_tags = ul_tag.find_all(name="li")
            corporations = []
            for li_tag in li_tags:
                corporation = li_tag.text.strip()
                corporation_dict = {
                    "corporation": corporation,
                    "province_url": province_url,  # captured from the enclosing loop
                    "page_url": page_url,
                    "状态": "未完成"  # status: "not done", stored verbatim in Mongo
                }
                corporations.append(corporation)
                col = self.db.All_Company_Name
                col.insert_one(corporation_dict)
            print(corporations)
            return corporations

        for province_id in range(1, 36):
            province_url = "http://www.soudh.com/province-" + str(
                province_id) + ".html"
            html_analyse = HtmlAnalyse(province_url)
            bs_content = html_analyse.get_bs_contents()
            # The pager span reads "当前为第X页 共N页" ("page X of N in total").
            page_tag = bs_content.find(name="span", text=re.compile(r'当前为'))
            page_count = int(re.match(r'.*?共(\d+)页', page_tag.text).group(1))
            page_urls = map(
                lambda page_num: province_url[:-5] + "-" + str(page_num) +
                ".html", range(1, page_count + 1))
            #
            # for page_url in page_urls:
            #     thread_go(page_url)

            threading_pool = ThreadingPool()
            threading_pool.multi_thread(thread_go, page_urls)
Example 3
class MouserGo:
    def __init__(self):
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()
        self.mouser_host_url = "http://www.mouser.cn"
        self.my_session = requests.session()

    def get_all_category(self):
        while True:
            try:
                self.my_session.proxies.update(self.proxy_ip)
                my_headers = {'Connection': 'Keep-Alive',
                              'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
                              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                              'Accept-Encoding': 'gzip, deflate, sdch',
                              "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36',
                              "Host": "www.mouser.cn", "Upgrade-Insecure-Requests": "1",
                              "Referer": "http://www.mouser.cn/Electronic-Components/", }
                self.my_session.headers.update(my_headers)
                res = self.my_session.get("http://www.mouser.cn/Electronic-Components/")
                if res.status_code != 200:
                    print(res.status_code)
                    self.proxy_pool.remove(self.proxy_ip)
                    self.proxy_ip = self.proxy_pool.get()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")
                category_url_tags = bs_content.find_all(name="a", attrs={"class": "SearchResultsSubLevelCategory"})
                if not category_url_tags:
                    print(sys._getframe().f_code.co_name, "category_url_tag is None")
                    continue
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
        multi_category_structures = []
        for category_url_tag in category_url_tags:
            url = self.mouser_host_url + category_url_tag.get("href")[2:]
            single_category_structures = self.get_detail_category(url)
            multi_category_structures += single_category_structures
        return multi_category_structures

    def get_detail_category(self, url):
        while True:
            try:
                detail_headers = {'Connection': 'Keep-Alive',
                                  'Accept-Language': 'zh-CN,zh;q=0.8',
                                  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                                  'Accept-Encoding': 'gzip, deflate, sdch',
                                  "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36',
                                  "Host": "www.mouser.cn", "Upgrade-Insecure-Requests": "1",
                                  "Referer": "http://www.mouser.cn/Electronic-Components/", }
                self.my_session.proxies.update(self.proxy_ip)
                self.my_session.headers.update(detail_headers)
                res = self.my_session.get(url, timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    self.proxy_pool.remove(self.proxy_ip)
                    self.proxy_ip = self.proxy_pool.get()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")

                first_category_tag = bs_content.find(name="a",
                                                     id="ctl00_ContentMain_bc_rptrBreadcrumbs_ctl01_lnkBreadcrumb")
                if not first_category_tag:
                    self.proxy_pool.remove(self.proxy_ip)
                    print("None, go on")
                    self.proxy_ip = self.proxy_pool.get()
                    continue
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()

        first_category_name = first_category_tag.text
        second_category_tag = bs_content.find(name="a", id="ctl00_ContentMain_bc_rptrBreadcrumbs_ctl02_lnkBreadcrumb")
        second_category_name = second_category_tag.text
        third_category_tag = bs_content.find(name="a", id="ctl00_ContentMain_bc_rptrBreadcrumbs_ctl03_lnkBreadcrumb")
        if third_category_tag:
            third_category_name = third_category_tag.text
        else:
            third_category_name = second_category_name

        detail_category_tags = bs_content.find_all(name="div", attrs={"class": "div-cat-title"})
        category_structures = []
        if detail_category_tags:
            pre_category_url = re.match(r"(.+)/_/.+/$", url).group(1)
            for detail_category_tag in detail_category_tags:
                fourth_category_tag = detail_category_tag.a
                fourth_category_name = fourth_category_tag.text
                fourth_category_url = pre_category_url + fourth_category_tag.get("href")[5:]
                component_count = detail_category_tag.span.span.text.replace(",", "")
                category_structure = (
                    first_category_name, second_category_name, third_category_name, fourth_category_name,
                    fourth_category_url, component_count)
                category_structures.append(category_structure)
        else:
            fourth_category_name = third_category_name
            fourth_category_url = url
            component_count_tag = bs_content.find(name="span", id="ctl00_ContentMain_lblProductCount")
            component_count = component_count_tag.text.replace("(", "").replace(")", "").replace(",", "")
            category_structure = (
                first_category_name, second_category_name, third_category_name, fourth_category_name,
                fourth_category_url, component_count)
            category_structures.append(category_structure)
        print(category_structures)
        return category_structures

    def category_to_csv(self, category_structures):
        with open("..\\Mouser.csv", "w", encoding="utf-8") as f:
            for category_structure in category_structures:
                modify_category_structure = []
                for category_name in category_structure:
                    # Swap ASCII commas for full-width commas so the CSV
                    # delimiter survives inside field values.
                    modify_category_name = category_name.replace(",", "，")
                    modify_category_structure.append(modify_category_name)
                f.write(",".join(modify_category_structure) + "\n")

    def read_from_csv(self):
        csv_categories = []
        with open("..\\Mouser.csv", "r", encoding="utf-8") as f:
            read = csv.reader(f)
            for line in read:
                print(line)
                csv_categories.append(line)
        return csv_categories



    def thread_go(self, category_tree):
        first_category_name = category_tree[0].replace("\ufeff", "")
        second_category_name = str(category_tree[1:-2])
        url, component_count = category_tree[-2:]
        if int(component_count) <= 1:
            return
        my_headers = {'Connection': 'Keep-Alive',
                      'Accept-Language': 'zh-CN,zh;q=0.8',
                      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                      'Accept-Encoding': 'gzip, deflate, sdch',
                      "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36',
                      "Host": "www.mouser.cn", "Upgrade-Insecure-Requests": "1", }
        for page_num in range(0, int(component_count), 25):
            page_url = url + "?No=" + str(page_num)
            count = 0
            while True:
                try:
                    self.my_session.headers.update(my_headers)
                    self.my_session.proxies.update(self.proxy_ip)
                    res = self.my_session.get(page_url, timeout=20)
                    if res.status_code != 200:
                        print(res.status_code)
                        self.proxy_pool.remove(self.proxy_ip)
                        self.proxy_ip = self.proxy_pool.get()
                        continue
                    bs_content = BeautifulSoup(res.content, "lxml")
                    component_tags = bs_content.find(name="table", attrs={"class": "SearchResultsTable"}).find_all(
                        name="tr", attrs={"class": re.compile(r"SearchResult")})

                    break
                except Exception as e:
                    count += 1
                    print(sys._getframe().f_code.co_name, e)
                    self.proxy_ip = self.proxy_pool.get()
                    if count > 20:
                        self.proxy_pool._refresh()



            table_header_tags = component_tags[0].find_all(name="th")[11:]

            for component_tag in component_tags[2:]:
                td_tags = component_tag.find_all(name="td")
                try:
                    rough_component_code = td_tags[3].text.strip()
                    # Keep only the first line of the cell text.
                    component_code = rough_component_code.split("\n", 1)[0]
                except Exception as e:
                    print("component code is None", e)
                    continue
                try:
                    component_img = self.mouser_host_url + td_tags[1].find(name="img").get("src").replace("/sm/",
                                                                                                          "/images/")
                except Exception:
                    component_img = ""
                try:
                    # "数据表" is the Chinese link text for "datasheet".
                    rough_attach = td_tags[6].find(name="a", text=re.compile(r".*数据表"))
                    component_attach = rough_attach.get("href")
                    if "http" not in component_attach:
                        component_attach = ""
                except Exception as e:
                    print("pdf is none", page_url, component_code)
                    component_attach = ""
                    # if not component_img:
                    #     continue
                try:
                    component_brand = td_tags[4].a.text
                except Exception as e:
                    print(sys._getframe().f_code.co_name, e)
                    continue

                component = (
                    component_code, component_brand, first_category_name, second_category_name, page_url,
                    component_attach,
                    component_img)
                count = 0
                try:
                    rohs_tag = td_tags[10]
                except Exception as e:
                    print(e)
                    continue

                property_key_values = []
                if rohs_tag.text == "详细信息":  # "details" link marks RoHS-compliant parts
                    key_value = ("RoHS", "Yes")
                    property_key_values.append(key_value)

                len_heads = len(table_header_tags)
                if len_heads:
                    for name_tag, property_tag in zip(table_header_tags, td_tags[-len_heads:]):

                        property_name = name_tag.text.strip()

                        property_value = property_tag.text.strip()
                        key_value = (property_name, property_value)
                        property_key_values.append(key_value)

                while True:
                    try:
                        orcl_conn = OracleSave(1000002)
                        orcl_conn.component_insert(component)
                        for key_value in property_key_values:
                            orcl_conn.properties_insert(key_value)
                        orcl_conn.commit()
                        orcl_conn.conn.close()

                        break
                    except Exception as e:
                        print(e)
                        count += 1
                        # if count > 3:
                        #     break

    def get_page_url(self, category_trees):
        pages_category = []
        for category_tree in category_trees:
            first_category_name = category_tree[0].replace("\ufeff", "")
            second_category_name = str(category_tree[1:-2])
            url, component_count = category_tree[-2:]
            if int(component_count) <= 1:
                continue
            for page_num in range(0, int(component_count), 25):
                page_url = url + "?No=" + str(page_num)
                page_category = (first_category_name, second_category_name, page_url)
                pages_category.append(page_category)
        return pages_category

    def page_thread_go(self, page_category):
        first_category_name, second_category_name, page_url = page_category
        count = 0
        my_headers = {'Connection': 'Keep-Alive',
                      'Accept-Language': 'zh-CN,zh;q=0.8',
                      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                      'Accept-Encoding': 'gzip, deflate, sdch',
                      "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36',
                      "Host": "www.mouser.cn", "Upgrade-Insecure-Requests": "1", }
        while True:
            try:
                self.my_session.headers.update(my_headers)
                self.my_session.proxies.update(self.proxy_ip)
                res = self.my_session.get(page_url, timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    self.proxy_pool.remove(self.proxy_ip)
                    self.proxy_ip = self.proxy_pool.get()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")
                component_tags = bs_content.find(name="table", attrs={"class": "SearchResultsTable"}).find_all(
                    name="tr", attrs={"class": re.compile(r"SearchResult")})

                break
            except Exception as e:
                count += 1
                print(sys._getframe().f_code.co_name, e)
                self.proxy_ip = self.proxy_pool.get()
                if count > 20:
                    self.proxy_pool._refresh()

        table_header_tags = component_tags[0].find_all(name="th")[11:]

        for component_tag in component_tags[2:]:
            td_tags = component_tag.find_all(name="td")
            try:
                rough_component_code = td_tags[3].text.strip()
                # Keep only the first line of the cell text.
                component_code = rough_component_code.split("\n", 1)[0]
            except Exception as e:
                print("component code is None", e)
                continue
            try:
                component_img = self.mouser_host_url + td_tags[1].find(name="img").get("src").replace("/sm/",
                                                                                                      "/images/")
            except Exception:
                component_img = ""
            try:
                # "数据表" is the Chinese link text for "datasheet".
                rough_attach = td_tags[6].find(name="a", text=re.compile(r".*数据表"))
                component_attach = rough_attach.get("href")
                if "http" not in component_attach:
                    component_attach = ""
            except Exception as e:
                print("pdf is none", page_url, component_code)
                component_attach = ""
                # if not component_img:
                #     continue
            try:
                component_brand = td_tags[4].a.text
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                continue

            component = (
                component_code, component_brand, first_category_name, second_category_name, page_url,
                component_attach,
                component_img)
            count = 0
            try:
                rohs_tag = td_tags[10]
            except Exception as e:
                print(e)
                continue

            property_key_values = []
            if rohs_tag.text == "详细信息":  # "details" link marks RoHS-compliant parts
                key_value = ("RoHS", "Yes")
                property_key_values.append(key_value)

            len_heads = len(table_header_tags)
            if len_heads:
                for name_tag, property_tag in zip(table_header_tags, td_tags[-len_heads:]):
                    property_name = name_tag.text.strip()

                    property_value = property_tag.text.strip()
                    key_value = (property_name, property_value)
                    property_key_values.append(key_value)

            while True:
                try:
                    orcl_conn = OracleSave(1000002)
                    orcl_conn.component_insert(component)
                    for key_value in property_key_values:
                        orcl_conn.properties_insert(key_value)
                    orcl_conn.commit()
                    orcl_conn.conn.close()

                    break
                except Exception as e:
                    print("database save exception", e)
                    count += 1
Example 4
class RsGo:
    def __init__(self):
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()

    def get_second_category(self):
        while True:
            try:

                html_analyse = HtmlAnalyse(
                    "http://china.rs-online.com/web/c/pcb-prototyping/pcb-cleaning/",
                    proxy=self.proxy_ip)
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self.proxy_ip = self.proxy_pool.get()
        first_categories = bs_content.find_all(
            name="div", attrs={"class": "horizontalMenu sectionUp"})
        second_categories = []
        for first_category in first_categories:
            first_category_name = first_category.span.text
            ul_tags = first_category.find_all(name="ul",
                                              attrs={"class": "column1"})
            for ul_tag in ul_tags:
                li_tags = ul_tag.find_all(name="li")
                for li_tag in li_tags:
                    second_category_url = Rs_Pre_Url + li_tag.a.get("href")
                    second_category_name = li_tag.a.text.replace(
                        li_tag.a.span.text, "").strip()
                    second_category = (first_category_name,
                                       second_category_name,
                                       second_category_url)
                    second_categories.append(second_category)
        return second_categories

    def get_page_url(self, second_category):
        first_category_name, second_category_name, second_category_url = second_category
        while True:
            try:
                html_analyse = HtmlAnalyse(second_category_url,
                                           proxy=self.proxy_ip)
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self.proxy_ip = self.proxy_pool.get()

        ul_tag = bs_content.find(name="ul", attrs={"class": "brcategories"})
        third_category_tags = ul_tag.find_all(
            name="div", attrs={"class": "rsGARealEstate"})
        for third_category_tag in third_category_tags:
            third_category_name = third_category_tag.a.text
            third_category_url = Rs_Pre_Url + third_category_tag.a.get("href")

            while True:
                try:
                    html_analyse = HtmlAnalyse(third_category_url,
                                               proxy=self.proxy_ip)

                    bs_content = html_analyse.get_bs_contents()
                    break
                except Exception as e:
                    print(sys._getframe().f_code.co_name, e)
                    self.proxy_ip = self.proxy_pool.get()
            try:
                page_tag = bs_content.find(name="div",
                                           attrs={
                                               "class": "viewProdDiv"
                                           }).text
            except Exception as e:
                print(third_category_url, e, "page_tag not found")
                continue
            # page_tag reads "...共N个" ("N items in total"); 20 items per page.
            flag = re.match(r".*?共(.*?)个", page_tag)
            page_count = int(int(flag.group(1).strip()) / 20 + 1)
            for page_num in range(int(page_count)):
                page_url = third_category_url + "?pn=" + str(page_num + 1)
                while True:
                    try:

                        html_analyse = HtmlAnalyse(page_url,
                                                   proxy=self.proxy_ip)
                        bs_content = html_analyse.get_bs_contents()
                        break
                    except Exception as e:
                        print(sys._getframe().f_code.co_name, e)
                        self.proxy_ip = self.proxy_pool.get()
                component_url_tags = bs_content.find_all(
                    name="a", attrs={"class": "tnProdDesc"})
                page_attributes = []
                for component_url_tag in component_url_tags:
                    component_url = Rs_Pre_Url + component_url_tag.get("href")
                    union_category_name = second_category_name + "---" + third_category_name
                    page_attribute = (first_category_name, union_category_name,
                                      component_url)
                    page_attributes.append(page_attribute)
                #
                threadingpool = ThreadingPool(4)
                threadingpool.multi_process(self.thread_go, page_attributes)

                # for page_attribute in page_attributes:
                #     self.thread_go(page_attribute)


    def thread_go(self, page_attributes):
        cc_unit, cc_kiname, cc_url = page_attributes
        html_analyse = HtmlAnalyse(cc_url)
        while True:
            try:
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)

        brand_tag = bs_content.find(name="span", attrs={"itemprop": "brand"})
        name_tag = bs_content.find(name="span", attrs={"itemprop": "mpn"})

        if not brand_tag or not name_tag:
            return
        cc_brandname = brand_tag.text.strip()

        cc_code = name_tag.text.strip()

        img_tag = bs_content.find(name="img", attrs={"itemprop": "image"})
        if not img_tag:
            cc_img = ""
        else:
            cc_img = Rs_Pre_Url + img_tag.get("src")

        attach_tag = bs_content.find(
            name="a",
            attrs={"onclick": re.compile(r"window\.open\('http://docs")})
        if not attach_tag:
            cc_attach = ""
        else:
            attach_name = attach_tag.get("onclick")
            try:
                cc_attach = re.match(r"window\.open\('(.*?\.pdf)'\)",
                                     attach_name).group(1)
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                cc_attach = ""

        component = (cc_code, cc_brandname, cc_unit, cc_kiname, cc_url,
                     cc_attach, cc_img)

        # Component properties
        while True:
            orcl_conn = None
            try:
                orcl_conn = OracleSave(1000005)
                orcl_conn.component_insert(component)
                component_properties = []
                tr_tags = bs_content.find_all(
                    name="tr", attrs={"class": re.compile(r"dr-table-row")})
                for tr_tag in tr_tags:
                    td_tags = tr_tag.find_all(name="td")
                    parameter_name = td_tags[1].text
                    parameter_value = td_tags[2].text
                    component_property = (parameter_name, parameter_value)
                    component_properties.append(component_property)

                    orcl_conn.properties_insert(component_property)
                orcl_conn.commit()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
            finally:
                # Guard against OracleSave itself failing, which previously
                # left orcl_conn unbound when the finally block ran.
                if orcl_conn is not None:
                    orcl_conn.conn.close()

    def csv_write(self, category_structures):
        with open("..\\Rs-online.csv", "w", encoding="utf-8") as f:
            for category_structure in category_structures:
                modify_category_structure = []
                for structure_name in category_structure:
                    # Full-width comma keeps the CSV delimiter unambiguous.
                    modify_structure_name = structure_name.replace(",", "，")
                    modify_category_structure.append(modify_structure_name)
                line = (",".join(modify_category_structure)) + "\n"
                f.write(line)

    def get_csv_categories(self):
        while True:
            try:
                html_analyse = HtmlAnalyse(
                    "http://china.rs-online.com/web/c/pcb-prototyping/pcb-cleaning/",
                    proxy=self.proxy_ip)
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
        first_categories = bs_content.find_all(
            name="div", attrs={"class": "horizontalMenu sectionUp"})
        third_categories = []
        for first_category in first_categories:
            first_category_name = first_category.span.text
            ul_tags = first_category.find_all(name="ul",
                                              attrs={"class": "column1"})
            for ul_tag in ul_tags:
                li_tags = ul_tag.find_all(name="li")
                for li_tag in li_tags:
                    second_category_url = Rs_Pre_Url + li_tag.a.get("href")
                    second_category_name = li_tag.a.text.replace(
                        li_tag.a.span.text, "").strip()
                    while True:
                        try:
                            html_analyse = HtmlAnalyse(second_category_url,
                                                       proxy=self.proxy_ip)
                            bs_content = html_analyse.get_bs_contents()
                            ul_tag = bs_content.find(
                                name="ul", attrs={"class": "brcategories"})

                            break
                        except Exception as e:
                            print(sys._getframe().f_code.co_name, e,
                                  second_category_url)
                            self.proxy_pool.remove(self.proxy_ip)
                            self.proxy_ip = self.proxy_pool.get()
                    if ul_tag:
                        third_category_tags = ul_tag.find_all(
                            name="div", attrs={"class": "rsGARealEstate"})
                        for third_category_tag in third_category_tags:
                            third_category_name = third_category_tag.a.text
                            third_category_url = Rs_Pre_Url + third_category_tag.a.get(
                                "href")
                            third_category = (first_category_name,
                                              second_category_name,
                                              third_category_name,
                                              third_category_url)
                            print(third_category)
                            third_categories.append(third_category)
                    else:
                        third_category = (first_category_name,
                                          second_category_name,
                                          second_category_name,
                                          second_category_url)
                        print(third_category)
                        third_categories.append(third_category)
        return third_categories
Example 5
class PdfDownload:
    def __init__(self, task_code):
        self.task_code = task_code

        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()

        self.path = "..\\tmp\\"
        if not os.path.exists(self.path):
            os.mkdir(self.path)
        self.db = OracleConnection()

    def write(self):
        with open(self.path + "text.txt", 'w') as f:
            f.write('aaa')

    def get_urls_from_db(self):
        cursor = self.db.conn.cursor()
        cursor.execute(
            "update product$component set cmp_attach=null where cmp_attach='None'"
        )
        # Drop pdfs that duplicate ones already crawled
        cursor.execute(
            "merge into product$component_crawl a using ( select cc_b2c_attach,cc_attach from product$component_crawl where cc_b2c_attach is not null group by cc_b2c_attach,cc_attach ) b on (a.cc_attach = b.cc_attach ) when matched then update set a.cc_b2c_attach = b.cc_b2c_attach where a.cc_b2c_attach is null"
        )
        cursor.execute(
            "select distinct cc_attach from product$component_crawl where cc_b2c_attach is null and cc_attach is not null and cc_task=(select cct_id from product$component_crawl_task where cct_taskid=:task_code)",
            task_code=self.task_code)
        pdf_datas = cursor.fetchall()
        cursor.close()
        self.db.conn.commit()
        self.db.conn.close()

        pdf_urls = []
        for pdf_data in pdf_datas:
            # if re.match(r'.*?\.pdf', pdf_data[0]):
            pdf_urls.append(pdf_data[0])
        return pdf_urls

    def download(self, pdf_url):
        filename = self.path + str(random.random()) + '.pdf'
        try:
            html_analyse = HtmlAnalyse(pdf_url, proxy=self.proxy_ip)
            html_analyse.download(filename)
            print("Download complete...")
        except Exception as e:
            print(e)
            self.proxy_pool.remove(self.proxy_ip)
            self.proxy_ip = self.proxy_pool.get()
            # Return the retry's filename; the original fell through and
            # returned the path of the failed attempt instead.
            return self.download(pdf_url)

        return filename

    def upload(self, filename, pdf_url):
        try:
            with open(filename, 'rb') as file:
                res = requests.post("http://10.10.100.200:9999/file/upload",
                                    files={'file': file})
                res_j = res.json()
            print("Upload complete")
            db = OracleConnection()
            cursor = db.conn.cursor()
            cursor.execute(
                "update product$component_crawl set cc_b2c_attach=:b2c where cc_attach=:attach",
                b2c=res_j['path'], attach=pdf_url)
            cursor.close()
            db.conn.commit()
            db.conn.close()

        except Exception as e:
            print(e)
            self.upload(filename, pdf_url)

    def go(self):
        pdf_urls = self.get_urls_from_db()
        for pdf_url in pdf_urls:
            filename = self.download(pdf_url)
            self.upload(filename, pdf_url)

    def thread_go(self):
        pdf_urls = self.get_urls_from_db()

        def thread(pdfurl):
            filename = self.download(pdfurl)
            self.upload(filename, pdfurl)

        threading_pool = ThreadingPool()
        threading_pool.multi_thread(thread, pdf_urls)
Example 6
class MLCC1Detail:
    def __init__(self, second_class):
        self.first_class_name, self.second_class_name, self.url, self.page_count = second_class
        self.proxy_pool = ProxyPool()

        self.proxy_ip = self.proxy_pool.get()

    def get_class_components(self):
        page_urls = map(lambda num: self.url + "&p=" + str(num),
                        range(1, self.page_count + 1))

        return page_urls

    def get_page_components(self, page_url):
        my_headers = First_Headers
        my_cookies = First_Cookies

        while True:
            try:
                my_session = requests.session()
                my_session.headers.update(my_headers)
                my_session.cookies.update(my_cookies)
                my_session.proxies.update(self.proxy_ip)
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                time.sleep(1)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
                continue
            try:
                res = my_session.get(page_url, timeout=15)
                content = res.content.decode()
            except Exception as e:
                print(e)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
                continue

            if res.status_code == 200 and content:
                break
            else:
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()

        bs_content = BeautifulSoup(content, "lxml")
        product_tags = bs_content.find_all(
            name="li", attrs={"data-id": re.compile(r'\d+')})

        many_components_properties = []
        for product_tag in product_tags:
            all_p_tags = product_tag.find_all(name="p")
            try:
                product_code = all_p_tags[0].b.a.text
            except Exception:
                continue

            product_brand = all_p_tags[0].find(name='a', id='brand_n').text

            product_parameter = all_p_tags[0].find(name="a", id="params").text
            try:
                product_pdf = product_tag.find(name="a",
                                               attrs={
                                                   "data-id": "pdf"
                                               }).get("href")
                if "http://" not in product_pdf:
                    product_pdf = Pre_Url + product_pdf
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                product_pdf = ""

            component = (product_code, product_brand, self.first_class_name,
                         self.second_class_name, page_url, product_pdf, "null")

            properties = [
                ("product_parameter", product_parameter),
            ]
            try:
                product_details = all_p_tags[3].find_all(name="span")
            except Exception as e:
                print(e)
                product_details = ""
            for product_detail in product_details:
                detail_text = product_detail.text.split(":")
                try:
                    key_value = (detail_text[0], detail_text[1])
                except Exception as e:
                    print(e)
                    key_value = (detail_text[0], "")
                properties.append(key_value)
            component_properties = (component, properties)
            many_components_properties.append(component_properties)
        return many_components_properties
Example 7
class SearchList:
    def __init__(self):
        self.proxy_pool = ProxyPool()
        self.page_count = ""

    def get_all_urls(self, key_word):
        while True:
            self.proxy_ip = self.proxy_pool.get()

            my_session = requests.session()
            tianyan_headers = TianYan_Headers
            tianyan_headers["Referer"] = (
                "http://www.tianyancha.com/search?key=" + key_word +
                "&checkFrom=searchBox").encode().decode('latin-1')
            my_session.headers.update(tianyan_headers)
            try:
                my_session.proxies.update(self.proxy_ip)
            except Exception as e:
                print(e)
                time.sleep(1)
                continue
            try:
                first_res = my_session.get(
                    "http://www.tianyancha.com/tongji/" + key_word +
                    ".json?random=" +
                    str(round(time.time(), 3)).replace(".", ""),
                    timeout=15)
                first_content = first_res.content
                # NOTE: eval of server bytes is unsafe; the endpoint returns JSON,
                # so json.loads(first_content) would be the safer parse.
                first_data_v = eval(first_content)["data"]["v"]
            except Exception as e:
                print(e)
                self.proxy_pool.remove(self.proxy_ip)

                continue
            if first_res.status_code != 200 or not first_content:
                self.proxy_pool.remove(self.proxy_ip)
                continue
            first_token = re.match(r".*?token=(.*?);.*?",
                                   str(bytes(eval(first_data_v)))).group(1)

            my_cookie = TianYan_Cookies

            my_cookie["token"] = first_token

            my_session.cookies.update(my_cookie)
            try:
                real_res = my_session.get("http://www.tianyancha.com/search/" +
                                          key_word + ".json?",
                                          timeout=15)

                content = real_res.content.decode()
            except Exception as e:
                print(e)
                self.proxy_pool.remove(self.proxy_ip)
                continue
            if first_res.status_code != 200 or not content:
                self.proxy_pool.remove(self.proxy_ip)
                continue
            break
        try:

            json_list = json.loads(content)
            brief_companies = json_list["data"]

        except Exception as e:
            print(e)
            return

        while True:
            try:
                conn = MongoClient("10.10.101.22", 27017)
                if not brief_companies:
                    print(key_word, "no data")
                    col = conn.spider.All_Company_Name
                    # status stored verbatim: "无数据" = "no data"
                    col.update_many({"corporation": key_word},
                                    {'$set': {"状态": "无数据"}})
                    conn.close()
                    return

                for brief_company in brief_companies:
                    company_id = brief_company["id"]
                    detail_company_url = "http://www.tianyancha.com/company/" + str(
                        company_id)
                    detail_company = {
                        "company_id": company_id,
                        "url": detail_company_url,
                        "状态": "未完成"  # status: "not done"
                    }
                    detail_col = conn.spider.All_Company_Info
                    detail_col.update_one({"company_id": company_id},
                                          {'$set': detail_company},
                                          upsert=True)
                col = conn.spider.All_Company_Name
                # status stored verbatim: "已完成" = "done"
                col.update_many({"corporation": key_word},
                                {'$set': {"状态": "已完成"}})
                print(key_word, "done")
                conn.close()
                break
            except Exception as e:
                print(e)
                continue
Example 8
class DetailInfo:
    def __init__(self):
        self.proxy_pool = ProxyPool()

        self.proxy_ip = self.proxy_pool.get()

    def get_detail(self, url):
        while True:
            my_session = requests.session()
            my_headers = TianYan_Detail_Headers
            my_headers["Referer"] = url
            my_session.headers.update(TianYan_Detail_Headers)
            try:
                my_session.proxies.update(self.proxy_ip)
            except Exception as e:
                print(e)
                time.sleep(1)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
                continue

            try:
                first_res = my_session.get(url.replace("company", "tongji") + ".json?random=" + str(
                    round(time.time(), 3)).replace(".", ""), timeout=15)
                first_content = first_res.content
                first_data_v = eval(first_content)["data"]["v"]
            except Exception as e:
                print(e)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
                continue
            if first_res.status_code != 200 or not first_content:
                print(first_res.status_code)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
                continue
            first_token = re.match(r".*?token=(.*?);.*?", str(bytes(eval(first_data_v)))).group(1)
            another = re.match(r".*?\{return'(.*?)'", str(bytes(eval(first_data_v)))).group(1)

            def get_wtf(another):
                # Map each comma-separated index through the fixed character table.
                data = another.split(",")
                secret = "6,b,t,f,2,z,l,5,w,h,q,i,s,e,c,p,m,u,9,8,y,k,j,r,x,n,-,0,3,4,d,1,a,o,7,v,g".split(",")
                wtf = ""
                for i in data:
                    wtf += secret[int(i)]
                return wtf

            first_wtf = get_wtf(another)

            my_cookie = TianYan_Detail_Cookies

            my_cookie["token"] = first_token
            my_cookie["_utm"] = first_wtf
            my_headers["CheckError"] = "check"
            my_headers["Referer"] = url

            my_session.cookies.update(my_cookie)
            my_session.headers.update(my_headers)

            try:
                real_res = my_session.get(url + ".json", timeout=15)

                content = real_res.content.decode()
            except Exception as e:
                print(e)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
                continue
            if real_res.status_code != 200 or not content:
                print(real_res.status_code)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
                continue
            try:
                json_list = json.loads(content)
                brief_companies = json_list["data"]
                break
            except Exception as e:
                print(e)

        conn = MongoClient("10.10.101.22", 27017)

        col = conn.spider.All_Company_Info
        if not brief_companies:
            print(url, "no data")
            col.update_one({"url": url}, {'$set': {"状态": "无数据"}})  # status: "no data"
        else:
            while True:
                try:
                    col.update_one({"url": url}, {'$set': {"data": brief_companies, "状态": "已完成"}})  # status: "done"
                    print(url, "done")
                    break
                except Exception as e:
                    print(e)
        conn.close()

        return
Example 9
class Category:
    def __init__(self):
        self.proxy_pool = ProxyPool()

    def get_categories(self):
        main_url = "http://www.chip1stop.com/web/CHN/zh/dispClassSearchTop.do"
        self.proxy_ip = self.proxy_pool.get()
        while True:
            try:
                html_analyse = HtmlAnalyse(main_url, proxy=self.proxy_ip)
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()

        dl_tags = bs_content.find_all(
            name="dl", attrs={"class": "categoryListDl clearfix"})

        second_categories = []
        for dl_tag in dl_tags:
            first_directory_name = dl_tag.dt.text
            second_directory_tags = dl_tag.find_all(name="dd")
            for second_directory_tag in second_directory_tags:
                rough_second_directory_name = second_directory_tag.text
                second_directory_name = re.match(
                    r"(.*?)\[", rough_second_directory_name).group(1).strip()
                second_directory_url = "http://www.chip1stop.com/web/CHN/zh" + second_directory_tag.span.a.get(
                    "href")[1:]
                second_directory = (first_directory_name,
                                    second_directory_name,
                                    second_directory_url)
                second_categories.append(second_directory)
        return second_categories

    def get_product_list(self):
        categories = self.get_categories()
        form_data = {
            "nextSearchIndex": "0",
            "dispPageNo": "1",
            "dispNum": "100",
            "type": "page"
        }
        request_headers = {
            "Accept": "text/html, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Content-Type": "application/x-www-form-urlencoded",
            "Origin": "http://www.chip1stop.com",
            "Host": "www.chip1stop.com",
            "Proxy-Connection": "keep-alive",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.14 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }
        request_cookie = {
            "CK_005": "jS1czThT76C51HOUQ42UtQ06TsvRnzI105VoKAixt4s=",
            "CK_002": "aYWM1+FztffTlWgoCLG3iw==",
            "CK_001": "v1gP31jjkR0=",
            "CK_007": "cPDwiM71wuQ=",
            "CK_006": "kQp2UYR7V1g=",
            "CK_008": "i0dI70Swgcs=",
            "WMONID": "VvpmCoTZsss",
            "_gat": "1",
            "_ga": "GA1.2.1422864444.1488415703",
            "JSESSIONIDVERSION": "2f633173:8",
            "JSESSIONID": "b7d640d0a05a7885ab3cab0168cf.ap2",
            "JREPLICA": "c1-instance6",
            "id":
            "27e37541744912b7||t=1486458155|et=730|cs=002213fd4869c45d604be72033",
            "Referer": "https://www.chip1stop.com/web/CHN/zh/login.do"
        }
        complete_form_data = {
            "nextSearchIndex": "0",
            "dispPageNo": "1",
            "dispNum": "25",
            "rental": "false",
            "partSameFlg": "false",
            "subWinSearchFlg": "false",
            "used": "false",
            "newProductFlg": "false",
            "newProudctHandlingFlg": "false",
            "newSameDayShippedFlg": "false",
            "eventId": "0001",
            "searchType": "2",
            "dispAllFlg": "true",
        }

        def thread_go(page_no):
            print("Page:", page_no)

            page_parts = range(0, 25, 5)
            for page_part in page_parts:
                print("Part:", page_part)
                # def thread_go(page_part):
                complete_form_data['nextSearchIndex'] = page_part
                complete_form_data['dispPageNo'] = page_no
                complete_form_data['type'] = "page"
                detail_url = second_category_url + "&dispPageNo=%d" % page_no

                while True:
                    try:
                        my_session.cookies.update(request_cookie)
                        res = my_session.post(detail_url,
                                              data=complete_form_data,
                                              proxies=self.proxy_ip,
                                              timeout=20)
                        print(res.status_code)
                        if res.status_code == 200:
                            content = res.content.decode()
                            bs_content = BeautifulSoup(content, "lxml")
                            tr_tags = bs_content.find_all(name="tr")[1:]
                            if tr_tags:
                                break
                        else:
                            self.proxy_pool.remove(self.proxy_ip)
                            self.proxy_ip = self.proxy_pool.get()
                    except Exception as e:
                        print(sys._getframe().f_code.co_name, e)
                        self.proxy_pool.remove(self.proxy_ip)
                        self.proxy_ip = self.proxy_pool.get()
                tr_tags = bs_content.find_all(name="tr")[1:]
                if not tr_tags:
                    continue
                # Database connection
                orcl_conn = OracleSave(1000001)

                for tr_tag in tr_tags:
                    try:
                        code = tr_tag.td.find(name="p",
                                              attrs={
                                                  "class": "text14pt2 bold"
                                              }).text.strip()
                    except Exception as e:
                        print(e)
                        continue

                    chip1stop_code = tr_tag.td.find(name="p",
                                                    attrs={
                                                        "class": "text10"
                                                    }).text.strip()
                    print(chip1stop_code)
                    maker = tr_tag.td.find(name="p",
                                           attrs={
                                               "class": "text10 wordBreak"
                                           }).text.strip()
                    pdf_url = tr_tag.find(
                        name="a",
                        attrs={
                            "href":
                            re.compile(
                                r"http://download\.siliconexpert\.com/pdfs")
                        })
                    if pdf_url:
                        pdf_url = pdf_url.get("href")

                    component = (code, maker, first_category_name,
                                 second_category_name, second_category_url,
                                 pdf_url, None)
                    orcl_conn.component_insert(component)

                    property_tags = tr_tag.find_all(name="td")[6:-1]
                    for property_name, property_tag in zip(
                            property_names, property_tags):
                        if property_name == '购买/询价':  # skip the "Buy / RFQ" column
                            continue
                        property_value = property_tag.text
                        if property_value:
                            property_value = property_value.strip()
                        single_property = (property_name, property_value)
                        orcl_conn.properties_insert(single_property)

                orcl_conn.commit()
                orcl_conn.conn.close()

        # def thread_go(page_no):

        for category in categories:
            first_category_name, second_category_name, second_category_url = category

            while True:
                try:
                    my_session = requests.session()
                    my_session.headers.update(request_headers)
                    self.proxy_ip = self.proxy_pool.get()
                    res = my_session.post(second_category_url,
                                          data=form_data,
                                          proxies=self.proxy_ip,
                                          timeout=10)
                    print(res.status_code)
                    if res.status_code == 200:
                        break
                    self.proxy_pool.remove(self.proxy_ip)
                    self.proxy_ip = self.proxy_pool.get()
                except Exception as e:
                    print(sys._getframe().f_code.co_name, e)
                    self.proxy_pool.remove(self.proxy_ip)
            content = res.content.decode()
            bs_content = BeautifulSoup(content, "lxml")
            products_count = bs_content.find(name="span",
                                             attrs={
                                                 "class": "bold_red"
                                             }).text.replace(",", "").replace(
                                                 "件", "")
            table_headers_tag = bs_content.find(name="tr",
                                                attrs={"class": "parent"})
            property_name_tags = table_headers_tag.find_all(name="td")
            # component property (parameter) names
            property_names = []
            for property_name_tag in property_name_tags[6:-1]:
                property_name = property_name_tag.text.strip()
                property_names.append(property_name)
            # total page count: ceil(products_count / 25); note that int(x / 25) + 1
            # fetches one extra page when the count is an exact multiple of 25
            pages_count = int(int(products_count) / 25) + 1

            if pages_count > 400:
                pages_count = 400  # cap the crawl at 400 pages per category

            # Threaded alternative to the sequential loop below:
            # threading_pool = ThreadingPool(10)
            # threading_pool.multi_process(thread_go, list(range(1, pages_count + 1)))

            for i in range(1, pages_count + 1):
                thread_go(i)
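
The loop above (and nearly every snippet in this collection) relies on one retry pattern: request through the current proxy, and on any exception or non-200 response discard that proxy and pull a fresh one. A minimal standalone sketch of the pattern, assuming only the ProxyPool get()/remove() interface used above:

import sys
import requests

def fetch_with_rotation(url, proxy_pool, data=None, timeout=10):
    # Retry through fresh proxies until a 200 response arrives.
    proxy = proxy_pool.get()
    while True:
        try:
            res = requests.post(url, data=data, proxies=proxy, timeout=timeout)
            if res.status_code == 200:
                return res
        except Exception as e:
            print(sys._getframe().f_code.co_name, e)
        # Bad status or exception: rotate to the next proxy.
        proxy_pool.remove(proxy)
        proxy = proxy_pool.get()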
Example 10
class FPNewark:
    def __init__(self):
        self.my_session = requests.session()
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()

    def get_category_trees(self, category_trees):
        multi_category_trees = []
        for category_tree in category_trees:
            url = category_tree[-2]
            count = 0
            while True:
                try:
                    self.my_session.proxies.update(self.proxy_ip)
                    res = self.my_session.get(url, timeout=20)
                    if res.status_code != 200:
                        print(res.status_code)
                        continue
                    bs_content = BeautifulSoup(res.content, "lxml")

                    break
                except Exception as e:
                    count += 1
                    print(sys._getframe().f_code.co_name, url, e)
                    self.proxy_ip = self.proxy_pool.get()
                    if count > 100:
                        self.proxy_pool._refresh()

            category_list = bs_content.find(name="ul",
                                            attrs={"class": "categoryList"})
            if not category_list:
                print(category_tree)
                multi_category_trees.append(category_tree)
                continue
            else:

                child_category_tags = category_list.find_all(name="a")

                category_trees = []
                for child_category_tag in child_category_tags:
                    child_category_url = child_category_tag.get("href")
                    rough_child_category_tag = child_category_tag.text.strip()
                    flag = re.match(r"(.*?) \((\d+.*?)\)",
                                    rough_child_category_tag)
                    child_category_name = flag.group(1)

                    component_count = flag.group(2).replace(",", "")
                    if component_count == '1':
                        continue

                    child_category = [
                        child_category_name, child_category_url,
                        component_count
                    ]
                    child_category_tree = list(
                        category_tree)[:-2] + child_category
                    category_trees.append(child_category_tree)
                child_categories = self.get_category_trees(category_trees)
                print(child_categories)
                multi_category_trees += child_categories
            print("Current Count: ", len(multi_category_trees))
        return multi_category_trees

    def get_first_categories(self):
        my_headers = dict(Default_Header)  # copy so the shared default header is not mutated
        my_headers["host"] = "www.newark.com"
        my_headers["Referer"] = "http://www.newark.com/"
        my_headers["Upgrade-Insecure-Requests"] = "1"
        while True:
            try:
                self.my_session.headers.update(my_headers)
                self.my_session.proxies.update(self.proxy_ip)

                res = self.my_session.get(
                    "http://www.newark.com/browse-for-products", timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")
                first_category_tags = bs_content.find_all(
                    name="ul", attrs={"class": "categoryList"})
                break
            except Exception as e:
                print("Part1", sys._getframe().f_code.co_name, e)
                self.proxy_ip = self.proxy_pool.get()

        second_pages = []
        for first_category_tag in first_category_tags:
            first_category_name = first_category_tag.li.h2.text.strip()
            second_category_tags = first_category_tag.li.ul.find_all(name="li")
            for second_category_tag in second_category_tags:
                second_category_url = second_category_tag.a.get("href")
                rough_second_category_name = second_category_tag.text.strip()
                flag = re.match(r"(.*?) \((\d+.*?)\)",
                                rough_second_category_name)
                second_category_name = flag.group(1)

                component_count = flag.group(2).replace(",", "")
                if component_count == '1':
                    continue
                second_page = (first_category_name, second_category_name,
                               second_category_url, component_count)
                second_pages.append(second_page)
        return second_pages

    def csv_write(self, category_structures):
        with open("..\\Newark_test.csv", "w", encoding="utf-8") as f:
            for category_structure in category_structures:
                modify_category_structure = []
                for structure_name in category_structure:
                    # Swap ASCII commas for full-width commas (，) so field
                    # text cannot collide with the CSV delimiter.
                    modify_structure_name = structure_name.replace(",", "，")
                    modify_category_structure.append(modify_structure_name)
                line = (",".join(modify_category_structure)) + "\n"
                f.write(line)

    def thread_go(self, category_tree):
        my_headers = dict(Default_Header)  # copy so the shared default header is not mutated
        my_headers["host"] = "www.newark.com"
        my_headers["Referer"] = "http://www.newark.com/"
        my_headers["Upgrade-Insecure-Requests"] = "1"
        first_category_name = category_tree[0]
        second_category_name = str(category_tree[1:-2])
        url, component_count = category_tree[-2:]
        page_count = int(int(component_count) / 25) + 1
        for page_num in range(877, page_count + 1):  # hard-coded resume point from an earlier run
            page_url = url + "/prl/results/" + str(page_num)
            count = 0
            while True:
                try:
                    self.my_session.headers.update(my_headers)
                    self.my_session.proxies.update(self.proxy_ip)
                    res = self.my_session.get(page_url, timeout=20)
                    if res.status_code != 200:
                        print(res.status_code)
                        self.proxy_pool.remove(self.proxy_ip)
                        self.proxy_ip = self.proxy_pool.get()
                        continue
                    bs_content = BeautifulSoup(res.content, "lxml")
                    component_tags = bs_content.find(
                        name="table", id="sProdList").tbody.find_all(name="tr")
                    break
                except Exception as e:
                    count += 1
                    print(sys._getframe().f_code.co_name, e)
                    self.proxy_ip = self.proxy_pool.get()
                    if count > 10:
                        print(category_tree, page_url)
                        component_tags = []
                        break

                    if count > 100:
                        # Note: unreachable, since the loop gives up at count > 10.
                        self.proxy_pool._refresh()

            for component_tag in component_tags:
                detail_table = component_tag.find(name="table",
                                                  attrs={"class": "TFtable"})
                td_tags = component_tag.find_all(name="td")
                try:
                    component_code = td_tags[1].text.strip()
                except Exception as e:
                    print("component code is None", e)
                    continue
                try:
                    component_img = td_tags[1].find(name="img",
                                                    attrs={
                                                        "class":
                                                        "productThumbnail"
                                                    }).get("src")
                except Exception:
                    component_img = ""
                try:
                    rough_attach = td_tags[2].find(name="a", text="数据表")  # "datasheet" link text
                    if not rough_attach:
                        rough_attach = td_tags[2].find(
                            name="a", attrs={"class": "prodDetailsAttachment"})
                    component_attach = rough_attach.get("href")
                    if "http" not in component_attach:
                        component_attach = ""
                except Exception as e:
                    component_attach = ""
                try:
                    manufacture_description = td_tags[3].a.find_all(name="p")
                    component_brand = manufacture_description[0].text.strip()
                    component_description = manufacture_description[
                        1].text.strip()
                except Exception as e:
                    component_brand = ""
                    print(sys._getframe().f_code.co_name, e)
                    continue
                if not component_img and not component_attach and not component_brand:
                    continue

                component = (component_code, component_brand,
                             first_category_name, second_category_name,
                             page_url, component_attach, component_img)
                count = 0
                while True:
                    try:
                        orcl_conn = OracleSave(1000003)
                        orcl_conn.component_insert(component)
                        if detail_table:
                            property_tags = detail_table.find_all(name="tr")
                            for property_tag in property_tags:
                                detail_td_tags = property_tag.find_all("td")
                                property_name = detail_td_tags[0].text.strip()
                                property_value = detail_td_tags[1].text.strip()
                                key_value = (property_name, property_value)
                                orcl_conn.properties_insert(key_value)
                        orcl_conn.commit()
                        orcl_conn.conn.close()

                        break
                    except Exception as e:
                        print(e)
                        count += 1
                        # if count > 3:
                        #     break

    def extra_go(self, category_tree):
        my_headers = dict(Default_Header)  # copy so the shared default header is not mutated
        my_headers["host"] = "www.newark.com"
        my_headers["Referer"] = "http://www.newark.com/"
        my_headers["Upgrade-Insecure-Requests"] = "1"
        first_category_name = category_tree[0]
        second_category_name = str(category_tree[1:-2])
        url, component_count = category_tree[-2:]
        page_count = int(int(component_count) / 25) + 1  # computed but unused: the range below is pinned
        page_range = range(875, 17557)  # hard-coded resume window from an earlier run

        def extra_thread(page_num):
            page_url = url + "/prl/results/" + str(page_num)
            count = 0
            while True:
                try:
                    self.my_session.headers.update(my_headers)
                    self.my_session.proxies.update(self.proxy_ip)
                    res = self.my_session.get(page_url, timeout=20)
                    if res.status_code != 200:
                        print(res.status_code)
                        self.proxy_pool.remove(self.proxy_ip)
                        self.proxy_ip = self.proxy_pool.get()
                        continue
                    bs_content = BeautifulSoup(res.content, "lxml")
                    component_tags = bs_content.find(
                        name="table", id="sProdList").tbody.find_all(name="tr")
                    break
                except Exception as e:
                    count += 1
                    print(sys._getframe().f_code.co_name, e)
                    self.proxy_ip = self.proxy_pool.get()
                    if count > 10:
                        print(category_tree, page_url)
                        component_tags = []
                        break

                    if count > 100:
                        # Note: unreachable, since the loop gives up at count > 10.
                        self.proxy_pool._refresh()

            for component_tag in component_tags:
                detail_table = component_tag.find(name="table",
                                                  attrs={"class": "TFtable"})
                td_tags = component_tag.find_all(name="td")
                try:
                    component_code = td_tags[1].text.strip()
                except Exception as e:
                    print("component code is None", e)
                    continue
                try:
                    component_img = td_tags[1].find(name="img",
                                                    attrs={
                                                        "class":
                                                        "productThumbnail"
                                                    }).get("src")
                except Exception:
                    component_img = ""
                try:
                    rough_attach = td_tags[2].find(name="a", text="数据表")  # "datasheet" link text
                    if not rough_attach:
                        rough_attach = td_tags[2].find(
                            name="a", attrs={"class": "prodDetailsAttachment"})
                    component_attach = rough_attach.get("href")
                    if "http" not in component_attach:
                        component_attach = ""
                except Exception as e:
                    component_attach = ""
                try:
                    manufacture_description = td_tags[3].a.find_all(name="p")
                    component_brand = manufacture_description[0].text.strip()
                    component_description = manufacture_description[
                        1].text.strip()
                except Exception as e:
                    component_brand = ""
                    print(sys._getframe().f_code.co_name, e)
                    continue
                if not component_img and not component_attach and not component_brand:
                    continue

                component = (component_code, component_brand,
                             first_category_name, second_category_name,
                             page_url, component_attach, component_img)
                count = 0
                while True:
                    try:
                        orcl_conn = OracleSave(1000003)
                        orcl_conn.component_insert(component)
                        if detail_table:
                            property_tags = detail_table.find_all(name="tr")
                            for property_tag in property_tags:
                                detail_td_tags = property_tag.find_all("td")
                                property_name = detail_td_tags[0].text.strip()
                                property_value = detail_td_tags[1].text.strip()
                                key_value = (property_name, property_value)
                                orcl_conn.properties_insert(key_value)
                        orcl_conn.commit()
                        orcl_conn.conn.close()

                        break
                    except Exception as e:
                        print(e)
                        count += 1
                        # if count > 3:
                        #     break

        extra_threading = ThreadingPool(8)
        extra_threading.multi_process(extra_thread, page_range)

    def read_from_csv(self):
        csv_categories = []
        with open("..\\Newark_test.csv", "r", encoding="utf-8") as f:
            read = csv.reader(f)
            for line in read:
                print(line)
                csv_categories.append(line)
        return csv_categories
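
Read together, the FPNewark methods form a pipeline: enumerate top-level categories, recurse down to leaf categories, checkpoint the tree to CSV, then scrape every result page of each leaf. A hypothetical driver (not part of the original source; all names are defined above):

fp = FPNewark()
second_pages = fp.get_first_categories()          # (first, second, url, count) tuples
leaf_trees = fp.get_category_trees(second_pages)  # recurse until no child categoryList
fp.csv_write(leaf_trees)                          # checkpoint; commas escaped to full-width
for tree in fp.read_from_csv():                   # resume from the checkpoint
    fp.thread_go(tree)                            # scrape each result page of the leaf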
Example 11
class DownloadUpload:
    def __init__(self):
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()

    def file_download(self, url, file_type, file_name=None):
        # A default of str(random.random()) is evaluated once at definition
        # time, so every call would share one name; generate it per call.
        if file_name is None:
            file_name = str(random.random())
        download_dir_path = "..\\download_files\\"
        if not os.path.exists(download_dir_path):
            os.mkdir(download_dir_path)
        download_file_path = download_dir_path + file_name + file_type
        if os.path.exists(download_file_path):
            return
        try_count = 0
        while True:
            try:
                download_file_path = download_dir_path + str(random.random()) + file_type
                # html_analyse = HtmlAnalyse(url, proxy=self.proxy_ip)
                my_session = requests.session()
                my_headers = {'Connection': 'Keep-Alive',
                              'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
                              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                              'Accept-Encoding': 'gzip, deflate, sdch',
                              "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
                              "host": "www.newark.com", "Referer": "http://www.newark.com/",
                              "Upgrade-Insecure-Requests": "1"}
                my_session.headers.update(my_headers)
                my_session.proxies.update(self.proxy_ip)
                res = my_session.get(url)
                if res.status_code != 200 or not res.content:
                    print(sys._getframe().f_lineno, "empty or non-200 response, retrying")
                    continue
                with open(download_file_path, 'wb') as f:
                    f.write(res.content)

                print("File Download Success !!")
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, url, e)
                try_count += 1
                # if try_count > 2 and "https" in url:
                #     return
                # if try_count > 5:
                #     return
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
                # download_file_path = download_dir_path + str(random.random()) + file_type

        return download_file_path

    def file_upload(self, local_file_path):
        if not local_file_path:
            return
        while True:
            try:
                with open(local_file_path, "rb") as f:
                    res = requests.post(File_Server_Url, files={'file': f})
                    if res.status_code == 200:
                        res_j = res.json()
                        break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
        server_file_path = res_j["path"]
        print("File Server Upload Success !!")
        return server_file_path

    def download_upload(self, url, file_type):
        download_file_path = self.file_download(url, file_type)
        server_file_path = self.file_upload(download_file_path)
        return server_file_path
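
DownloadUpload is used as a single round trip: fetch the remote file to a randomly named local path, then push it to the file server. A hypothetical call, assuming File_Server_Url is configured elsewhere in the module (the URL below is illustrative, not from the source):

du = DownloadUpload()
server_path = du.download_upload(
    "http://www.newark.com/datasheets/example.pdf",  # illustrative URL
    ".pdf")
print(server_path)  # server-side path returned by the upload endpoint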
Example 12
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Content-Type": "application/x-www-form-urlencoded",
    "Origin": "http://www.chip1stop.com",
    "Referer":
    "http://www.chip1stop.com/web/CHN/zh/search.do?classCd=040101&classLv=3&subWinSearchFlg=false&searchType=2&dispAllFlg=true&searchFlg=false",
    "Host": "www.chip1stop.com",
    "Proxy-Connection": "keep-alive",
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.14 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}

if __name__ == "__main__":
    proxy_pool = ProxyPool()
    proxy_ip = proxy_pool.get()
    my_session = requests.session()
    my_session.headers.update(request_headers)
    while True:
        try:
            res = my_session.post(
                "http://www.chip1stop.com/web/CHN/zh/search.do?",
                data=form_data,
                proxies=proxy_ip)
            print(res.status_code)
            break
        except Exception as e:
            print(e)
            proxy_pool.remove(proxy_ip)
            proxy_ip = proxy_pool.get()
    content = res.content.decode()
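
The POST above fetches the first result page; later pages are derived from the total count just as in the earlier examples. Note that int(int(products_count) / 25) + 1 requests one extra, empty page whenever the total is an exact multiple of 25; a ceiling division avoids that (a sketch, not part of the original source):

def page_count(products_count, page_size=25):
    # Ceiling division without importing math.
    return -(-int(products_count) // page_size)

assert page_count(50) == 2   # int(50 / 25) + 1 would give 3
assert page_count(51) == 3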
Example 13
class PdfDownload1:
    def __init__(self, task_code):
        self.task_code = task_code

        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()

        self.path = "D:\\pdf\\"
        if not os.path.exists(self.path):
            os.mkdir(self.path)
        self.db = OracleConnection()

    def write(self):
        # Smoke test: verify the download directory is writable.
        with open(self.path + "text.txt", 'w') as f:
            f.write('aaa')

    def get_urls_from_db(self):
        cursor = self.db.conn.cursor()
        cursor.execute(
            "update product$component set cmp_attach=null where cmp_attach='None'"
        )
        # drop PDFs that duplicate ones crawled earlier
        cursor.execute(
            "merge into product$component_crawl a using ( select cc_b2c_attach,cc_attach from product$component_crawl where cc_b2c_attach is not null group by cc_b2c_attach,cc_attach ) b on (a.cc_attach = b.cc_attach ) when matched then update set a.cc_b2c_attach = b.cc_b2c_attach where a.cc_b2c_attach is null"
        )
        cursor.execute(
            "select distinct cc_attach from product$component_crawl where cc_b2c_attach is null and cc_attach is not null and cc_task=(select cct_id from product$component_crawl_task where cct_taskid='{}')"
            .format(self.task_code))
        pdf_datas = cursor.fetchall()
        cursor.close()
        self.db.conn.commit()
        self.db.conn.close()

        pdf_urls = []
        for pdf_data in pdf_datas:
            # if re.match(r'.*?\.pdf', pdf_data[0]):
            pdf_urls.append(pdf_data[0])
        return pdf_urls

    def download(self, pdf_url):
        # The original code matched an undefined name `a`; assume the
        # onclick text (downloadLinkClick(...)) is carried in pdf_url.
        content_list = re.match(r'downloadLinkClick\((.*?)\);return false',
                                pdf_url).group(1).split(",")
        filename = content_list[0].replace("'", "")

        url = "http://ds.yuden.co.jp/TYCOMPAS/cs/detail.do?mode=download&fileName=" + filename

        isSeriesData = content_list[1]
        isProductsData = content_list[2]
        isProductsDataGraph = content_list[3]
        DownloadForm = {
            "action": "detail.do",
            "classificationID": "AE",
            "fileName": filename,
            "isSeriesData": isSeriesData,
            "isProductsData": isProductsData,
            "isProductsDataGraph": isProductsDataGraph
        }
        html_analyse = HtmlAnalyse(url)
        html_analyse.post_download(
            data=DownloadForm,
            path="I:\\PythonPrj\\StandardSpider\\DataAnalyse\\NewRules\\a.pdf")

        filename = self.path + str(random.random()) + '.pdf'
        try:
            html_analyse = HtmlAnalyse(url, proxy=self.proxy_ip)
            html_analyse.download(filename)
            print("Download complete...")
        except Exception as e:
            print(e)
            self.proxy_pool.remove(self.proxy_ip)
            self.proxy_ip = self.proxy_pool.get()
            self.download(pdf_url)  # recursive retry; see the iterative sketch below

        return filename

    def upload(self, filename, pdf_url):
        try:
            with open(filename, 'rb') as file:
                res = requests.post("http://10.10.100.200:9999/file/upload",
                                    files={'file': file})
                res_j = res.json()
            print("Upload complete")
            db = OracleConnection()
            cursor = db.conn.cursor()
            cursor.execute(
                "update product$component_crawl set cc_b2c_attach='{}' where cc_attach='{}'"
                .format(res_j['path'], pdf_url))
            cursor.close()
            db.conn.commit()
            db.conn.close()

        except Exception as e:
            print(e)
            self.upload(filename, pdf_url)  # recursive retry; unbounded on persistent failure

    def go(self):
        pdf_urls = self.get_urls_from_db()
        for pdf_url in pdf_urls:
            filename = self.download(pdf_url)
            self.upload(filename, pdf_url)

    def thread_go(self):
        pdf_urls = self.get_urls_from_db()

        def thread(pdfurl):
            filename = self.download(pdfurl)
            self.upload(filename, pdfurl)

        threading_pool = ThreadingPool()
        threading_pool.multi_thread(thread, pdf_urls)
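
Both download() and upload() retry by calling themselves, so a persistently failing proxy pool or file server recurses until the stack overflows. An iterative equivalent of download()'s retry, sketched against the same HtmlAnalyse/ProxyPool interfaces (download_iterative is a hypothetical name, not in the source):

import random

def download_iterative(spider, pdf_url):
    # spider is a PdfDownload1 instance; loop instead of recursing on failure.
    filename = spider.path + str(random.random()) + '.pdf'
    while True:
        try:
            HtmlAnalyse(pdf_url, proxy=spider.proxy_ip).download(filename)
            print("Download complete...")
            return filename
        except Exception as e:
            print(e)
            spider.proxy_pool.remove(spider.proxy_ip)
            spider.proxy_ip = spider.proxy_pool.get()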