Example #1
    def thread_go(self):
        pdf_urls = self.get_urls_from_db()

        def thread(pdfurl):
            filename = self.download(pdfurl)
            self.upload(filename, pdfurl)

        threading_pool = ThreadingPool()
        threading_pool.multi_thread(thread, pdf_urls)
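
Example #1 is the basic shape every snippet below repeats: pull a work list, define a per-item worker, and hand both to ThreadingPool. The ThreadingPool class itself never appears in these listings, so the following is only a sketch of the interface the examples seem to assume (both multi_thread and multi_process are always called as pool.method(callable, iterable), and ThreadingPool() is constructed with an optional worker count). It is built here on multiprocessing.dummy; the real implementation may differ.

    from multiprocessing.dummy import Pool  # thread-backed Pool with the multiprocessing API

    class ThreadingPool:
        """Minimal stand-in for the ThreadingPool used throughout these examples."""

        def __init__(self, pool_size=10):  # the default size is an assumption
            self.pool_size = pool_size

        def multi_thread(self, func, iterable):
            # Map func over iterable with a pool of worker threads,
            # blocking until every item has been processed.
            pool = Pool(self.pool_size)
            try:
                return pool.map(func, iterable)
            finally:
                pool.close()
                pool.join()

        # Despite the name, multi_process is invoked with the same
        # (callable, iterable) signature everywhere below, so it is
        # sketched here as an alias.
        multi_process = multi_thread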
Example #2
    def all_go(self):
        get_urls = GetUrls()
        first_classes = get_urls.get_first_classes()
        for first_class in first_classes:
            second_classes = get_urls.get_second_classes(first_class)
            for second_class in second_classes:
                self.detail = MLCC1Detail(second_class)
                page_urls = self.detail.get_class_components()

                threadingpool = ThreadingPool(4)
                threadingpool.multi_process(self.thread_go, page_urls)
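
Example #2 walks the first two category levels sequentially and only fans out at the leaves, with a small pool of 4 workers and one call to self.thread_go per page URL. Note that multi_process receives a bound method; a genuinely process-backed pool would need that object (and its instance) to be picklable, one more hint that these pools are thread-based despite the method name.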
Example #3
    def get_suppliers(self):
        def thread_go(page_url):
            html_analyse = HtmlAnalyse(page_url)
            while True:
                try:
                    bs_content = html_analyse.get_bs_contents()
                    break
                except Exception as e:
                    print(e)
            company_tags = bs_content.find_all(name="a",
                                               attrs={
                                                   "target": "_blank",
                                                   "href": re.compile(r"/\d+")
                                               })
            corporations = []
            for company_tag in company_tags:
                corporation = company_tag.text.strip()
                corporation_dict = {
                    "corporation": corporation,
                    "province_url": city_url,
                    "page_url": page_url,
                    "状态": "未完成",
                    "from": "99114"
                }
                corporations.append(corporation)
                col = self.db.All_Company_Name
                col.insert(corporation_dict)
            print(corporations)
            return corporations

        html_analyse = HtmlAnalyse("http://shop.99114.com/")
        bs_content = html_analyse.get_bs_contents()
        all_city_tags = bs_content.find_all(
            name="a",
            attrs={"href": re.compile(r"http://shop\.99114\.com/list/area")})
        for city_tag in all_city_tags:
            city_url = city_tag.get("href")
            html_analyse = HtmlAnalyse(city_url)
            bs_content = html_analyse.get_bs_contents()
            page_tag = bs_content.find_all(
                name="a", attrs={"href": re.compile(r"/list/area/")})[-2]
            page_count = int(page_tag.text.replace(",", ""))
            page_urls = map(
                lambda page_num: city_url[:-1] + str(page_num) + ".html",
                range(1, page_count + 1))

            # for page_url in page_urls:
            #     thread_go(page_url)

            threading_pool = ThreadingPool(12)
            threading_pool.multi_process(thread_go, page_urls)
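
The while True / try / except loop inside thread_go retries forever, so one permanently broken page pins its worker until the process is killed. A bounded alternative (a sketch; the helper name and backoff policy are mine, not part of the original code):

    import time

    def retry(func, attempts=5, delay=1.0):
        # Call func until it succeeds or the attempt budget runs out;
        # the last exception is re-raised so the caller sees the failure.
        for attempt in range(1, attempts + 1):
            try:
                return func()
            except Exception as e:
                print("attempt", attempt, "failed:", e)
                if attempt == attempts:
                    raise
                time.sleep(delay * attempt)  # linear backoff

The first fetch above would then read bs_content = retry(html_analyse.get_bs_contents).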
Example #4
    def get_suppliers(self):
        def thread_go(page_url):
            html_analyse = HtmlAnalyse(page_url)
            while True:
                try:
                    bs_content = html_analyse.get_bs_contents()
                    break
                except Exception as e:
                    print(e)
            ul_tag = bs_content.find(name="div",
                                     attrs={"class": "leftbox comlist"})
            li_tags = ul_tag.find_all(name="li")
            corporations = []
            for li_tag in li_tags:
                corporation = li_tag.text.strip()
                corporation_dict = {
                    "corporation": corporation,
                    "province_url": province_url,
                    "page_url": page_url,
                    "状态": "未完成"
                }
                corporations.append(corporation)
                col = self.db.All_Company_Name
                col.insert(corporation_dict)
            print(corporations)
            return corporations

        for province_id in range(1, 36):
            province_url = "http://www.soudh.com/province-" + str(
                province_id) + ".html"
            html_analyse = HtmlAnalyse(province_url)
            bs_content = html_analyse.get_bs_contents()
            page_tag = bs_content.find(name="span", text=re.compile(r'当前为'))
            page_count = int(re.match(r'.*?共(\d+)页', page_tag.text).group(1))
            page_urls = map(
                lambda page_num: province_url[:-5] + "-" + str(page_num) +
                ".html", range(1, page_count + 1))
            #
            # for page_url in page_urls:
            #     thread_go(page_url)

            threading_pool = ThreadingPool()
            threading_pool.multi_thread(thread_go, page_urls)
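
Both here and in Example #3, thread_go closes over the enclosing loop variable (province_url here, city_url there). That is only safe because multi_thread / multi_process evidently block until the batch drains before the loop advances; with a fire-and-forget dispatch every worker would read the final loop value. Default-argument binding removes the dependency on that timing, as this self-contained toy shows:

    # Default arguments are evaluated at definition time, so each
    # function keeps the loop value that was current when it was defined.
    workers = []
    for province_url in ("p1", "p2"):
        def thread_go(page_url, province_url=province_url):
            return (province_url, page_url)
        workers.append(thread_go)
    print([w("x") for w in workers])  # [('p1', 'x'), ('p2', 'x')]
    # Without the province_url=province_url default, both calls would
    # return ('p2', 'x'): closures read the variable's *current* value.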
Example #5
                    orcl_conn = OracleSave(1000002)
                    orcl_conn.component_insert(component)
                    for key_value in property_key_values:
                        orcl_conn.properties_insert(key_value)
                    orcl_conn.commit()
                    orcl_conn.conn.close()

                    break
                except Exception as e:
                    print("database save exception", e)
                    count += 1
                    # if count > 3:
                    #     break

if __name__ == "__main__":
    mouser_go = MouserGo()
    # multi_category_structures = mouser_go.get_all_category()
    # mouser_go.category_to_csv(multi_category_structures)
    init_multi_category_trees = mouser_go.read_from_csv()

    # multi_category_trees = init_multi_category_trees[270:271]
    for i in range(271, 986, 5):
        multi_category_trees = init_multi_category_trees[i: i+5]

        pages_category = mouser_go.get_page_url(multi_category_trees)
    # print(pages_category)
    # for page_category in pages_category:
    #     mouser_go.page_thread_go(page_category)
        threadingpool = ThreadingPool(16)
        threadingpool.multi_process(mouser_go.page_thread_go, pages_category)
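
Example #5 resumes from index 271 and feeds the Mouser category list to a 16-worker pool in batches of five; the commented-out [270:271] slice suggests earlier runs were restarted by hand, with the range bounds edited to skip work already done.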
Example #6
    def get_page_url(self, second_category):
        first_category_name, second_category_name, second_category_url = second_category
        while True:
            try:
                html_analyse = HtmlAnalyse(second_category_url,
                                           proxy=self.proxy_ip)
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self.proxy_ip = self.proxy_pool.get()

        ul_tag = bs_content.find(name="ul", attrs={"class": "brcategories"})
        third_category_tags = ul_tag.find_all(
            name="div", attrs={"class": "rsGARealEstate"})
        for third_category_tag in third_category_tags:
            third_category_name = third_category_tag.a.text
            third_category_url = Rs_Pre_Url + third_category_tag.a.get("href")

            while True:
                try:
                    html_analyse = HtmlAnalyse(third_category_url,
                                               proxy=self.proxy_ip)

                    bs_content = html_analyse.get_bs_contents()
                    break
                except Exception as e:
                    print(sys._getframe().f_code.co_name, e)
                    self.proxy_ip = self.proxy_pool.get()
            try:
                page_tag = bs_content.find(name="div",
                                           attrs={
                                               "class": "viewProdDiv"
                                           }).text
            except Exception as e:
                print(third_category_url, e, "找不到page_tag")  # "page_tag not found"
                continue
            flag = re.match(r".*?共(.*?)个", page_tag)
            page_count = int(int(flag.group(1).strip()) / 20 + 1)
            for page_num in range(int(page_count)):
                page_url = third_category_url + "?pn=" + str(page_num + 1)
                while True:
                    try:

                        html_analyse = HtmlAnalyse(page_url,
                                                   proxy=self.proxy_ip)
                        bs_content = html_analyse.get_bs_contents()
                        break
                    except Exception as e:
                        print(sys._getframe().f_code.co_name, e)
                        self.proxy_ip = self.proxy_pool.get()
                component_url_tags = bs_content.find_all(
                    name="a", attrs={"class": "tnProdDesc"})
                page_attributes = []
                for component_url_tag in component_url_tags:
                    component_url = Rs_Pre_Url + component_url_tag.get("href")
                    union_category_name = second_category_name + "---" + third_category_name
                    page_attribute = (first_category_name, union_category_name,
                                      component_url)
                    page_attributes.append(page_attribute)
                #
                threadingpool = ThreadingPool(4)
                threadingpool.multi_process(self.thread_go, page_attributes)

                # for page_attribute in page_attributes:
                #     self.thread_go(page_attribute)

            continue
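
One small arithmetic bug recurs in this example: int(count / 20 + 1) requests an extra, empty page whenever the item count is an exact multiple of 20 (assuming 20 items per page, as the division implies). math.ceil gives the exact page count:

    import math

    total = 40                    # e.g. parsed from a "共40个" ("40 items in total") label
    print(int(total / 20 + 1))    # 3 -- one empty page too many
    print(math.ceil(total / 20))  # 2 -- exact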
Example #7
                col.update({"corporation": key_word}, {'$set': {
                    "状态": "已完成"
                }},
                           multi=True)
                print(key_word, "已完成")
                conn.close()
                break
            except Exception as e:
                print(e)
                continue


# 470

if __name__ == "__main__":
    socket.setdefaulttimeout(30)
    mongo_conn = MongoClient("10.10.101.22", 27017)
    col = mongo_conn.spider.All_Company_Name
    search_list = SearchList()
    key_words = []
    for data in col.find({
            "状态": "未完成",
            "province_url": "http://www.soudh.com/province-6.html"
    }):
        key_word = data["corporation"]
        # search_list.get_all_urls(key_word)
        key_words.append(key_word)

    threadingpool = ThreadingPool(300)
    threadingpool.multi_process(search_list.get_all_urls, key_words)
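
col.update(..., multi=True) and the col.insert(...) calls in Examples #3 and #4 are the pymongo 2.x API, deprecated in 3.x and removed in 4.x. The modern equivalents, written in terms of this example's own variables, would be:

    col.update_many({"corporation": key_word}, {"$set": {"状态": "已完成"}})
    col.insert_one(corporation_dict)  # replaces col.insert(corporation_dict)

Also note ThreadingPool(300): for I/O-bound scraping, 300 Python threads mostly cost memory rather than speed, but they do run concurrently, since the GIL is released during network waits.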
Example #8
        return [[i] for i in ls]
    else:
        j = int(ls_len / n)
        k = ls_len % n
        ### j, j, j, ... (n - 1 chunks of size j up front), then one of j + k
        # step j, repeated n - 1 times
        ls_return = []
        for i in range(0, (n - 1) * j, j):
            ls_return.append(ls[i:i + j])
            # include the trailing chunk of size j + k
        ls_return.append(ls[(n - 1) * j:])
        return ls_return


if __name__ == "__main__":
    while True:
        rows = get_component2()
        try:
            if len(rows) > 19:
                ls_return = div_list(rows, 20)
                threadingpool = ThreadingPool(4)
                threadingpool.multi_process(thread_go, ls_return)
            else:
                thread_go(rows)
        except Exception as e:
            print(e)
        if len(rows) == 0:
            break
            # for ls_ret in ls_return:
            #     thread_go(ls_ret)
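
div_list splits a list into n chunks: n - 1 chunks of size j = len(ls) // n, with the final chunk absorbing the remainder k (the branch elided above evidently returns singleton chunks when the list has at most n elements). A quick check of the splitting logic:

    print(div_list(list(range(7)), 3))
    # [[0, 1], [2, 3], [4, 5, 6]] -- two chunks of 2, the last chunk takes the remainder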
Example #9
            cursor.execute("delete from ac$us$detail where name='{}'".format(
                init_row[1]))
        elif row == init_row:
            cursor.execute(
                "update ac$us$detail set modifystatus=0 where name='{}'".
                format(init_row[1]))
        else:
            sql = "update ac$us$detail set adminname='{}',name='{}',shortname='{}',industry='{}',tel='{}',address='{}',type='{}',modifystatus=1 where name='{}'".format(
                *row, init_row[1].replace("'", "''"))
            cursor.execute(sql)
        print(row, init_row, (row == init_row))
        cursor.close()
        conn.commit()
        conn.close()


if __name__ == "__main__":
    while True:
        try:
            data_inside = DataInside()
            rows = data_inside.get_data()
            if not rows:
                break
            # for row in rows:
            #     data_inside.modify_data(row)

            threadingpool = ThreadingPool(8)
            threadingpool.multi_process(data_inside.modify_data, rows)

        except Exception as e:
            print(sys._getframe().f_code.co_name, e)
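
Every statement in Example #9 is assembled with str.format, and only one value gets its quotes escaped by hand, so a name containing a single quote anywhere else breaks the SQL (or injects into it). DB-API bind variables sidestep quoting entirely; assuming a cx_Oracle-style driver, the update in the middle branch would become:

    cursor.execute(
        "update ac$us$detail set modifystatus=0 where name=:name",
        {"name": init_row[1]},  # the driver quotes this; no .replace("'", "''") needed
    )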
Example #10
        else:
            while True:
                try:
                    col.update({"url": url}, {'$set': {"data": brief_companies, "状态": "已完成"}})
                    print(url, "已完成")
                    break
                except Exception as e:
                    print(e)
        conn.close()

        return


if __name__ == "__main__":
    socket.setdefaulttimeout(30)
    mongo_conn = MongoClient("10.10.101.22", 27017)
    col = mongo_conn.spider.All_Company_Info


    # detail_info.get_detail("http://www.tianyancha.com/company/2546208953")
    #
    urls = []
    for data in col.find({"状态": "未完成"}):
        url = data["url"]
        # detail_info.get_detail(url)
        urls.append(url)
    detail_info = DetailInfo()
    threadingpool = ThreadingPool(800)
    threadingpool.multi_process(detail_info.get_detail, urls)
# Valentine's Day
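
The ThreadingPool(800) here leans on the socket.setdefaulttimeout(30) call above it: with 800 concurrent workers, a single hung connection would otherwise hold its thread indefinitely, and the 30-second cap is the only thing that frees it.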
Example #11
    def extra_go(self, category_tree):
        my_headers = Default_Header
        my_headers["host"] = "www.newark.com"
        my_headers["Referer"] = "http://www.newark.com/"
        my_headers["Upgrade-Insecure-Requests"] = "1"
        first_category_name = category_tree[0]
        second_category_name = str(category_tree[1:-2])
        url, component_count = category_tree[-2:]
        page_count = int(int(component_count) / 25) + 1
        page_range = range(875, 17557)  # hard-coded resume range; the computed page_count goes unused

        def extra_thread(page_num):
            page_url = url + "/prl/results/" + str(page_num)
            count = 0
            while True:
                try:
                    self.my_session.headers.update(my_headers)
                    self.my_session.proxies.update(self.proxy_ip)
                    res = self.my_session.get(page_url, timeout=20)
                    if res.status_code != 200:
                        print(res.status_code)
                        self.proxy_pool.remove(self.proxy_ip)
                        self.proxy_ip = self.proxy_pool.get()
                        continue
                    bs_content = BeautifulSoup(res.content, "lxml")
                    component_tags = bs_content.find(
                        name="table", id="sProdList").tbody.find_all(name="tr")
                    break
                except Exception as e:
                    count += 1
                    print(sys._getframe().f_code.co_name, e)
                    self.proxy_ip = self.proxy_pool.get()
                    if count > 10:
                        print(category_tree, page_url)
                        component_tags = []
                        break

                    # NOTE: unreachable -- the loop already gives up at
                    # count > 10 above, so this refresh can never fire.
                    if count > 100:
                        self.proxy_pool._refresh()

            for component_tag in component_tags:
                detail_table = component_tag.find(name="table",
                                                  attrs={"class": "TFtable"})
                td_tags = component_tag.find_all(name="td")
                try:
                    component_code = td_tags[1].text.strip()
                except Exception as e:
                    print("component code is None", e)
                    continue
                try:
                    component_img = td_tags[1].find(name="img",
                                                    attrs={
                                                        "class":
                                                        "productThumbnail"
                                                    }).get("src")
                except Exception:  # missing thumbnail -> empty string
                    component_img = ""
                try:
                    rough_attach = td_tags[2].find(name="a", text="数据表")  # link text "数据表" = "datasheet"
                    if not rough_attach:
                        rough_attach = td_tags[2].find(
                            name="a", attrs={"class": "prodDetailsAttachment"})
                    component_attach = rough_attach.get("href")
                    if "http" not in component_attach:
                        component_attach = ""
                except Exception as e:
                    component_attach = ""
                try:
                    manufacture_description = td_tags[3].a.find_all(name="p")
                    component_brand = manufacture_description[0].text.strip()
                    component_description = manufacture_description[
                        1].text.strip()
                except Exception as e:
                    component_brand = ""
                    print(sys._getframe().f_code.co_name, e)
                    continue
                if not component_img and not component_attach and not component_brand:
                    continue

                component = (component_code, component_brand,
                             first_category_name, second_category_name,
                             page_url, component_attach, component_img)
                count = 0
                while True:
                    try:
                        orcl_conn = OracleSave(1000003)
                        orcl_conn.component_insert(component)
                        if detail_table:
                            property_tags = detail_table.find_all(name="tr")
                            for property_tag in property_tags:
                                detail_td_tags = property_tag.find_all("td")
                                property_name = detail_td_tags[0].text.strip()
                                property_value = detail_td_tags[1].text.strip()
                                key_value = (property_name, property_value)
                                orcl_conn.properties_insert(key_value)
                        orcl_conn.commit()
                        orcl_conn.conn.close()

                        break
                    except Exception as e:
                        print(e)
                        count += 1
                        # if count > 3:
                        #     break

        extra_threading = ThreadingPool(8)
        extra_threading.multi_process(extra_thread, page_range)
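
Example #11 pulls the whole pattern together: a per-page worker with proxy rotation and a bounded fetch retry, row parsing that degrades gracefully (a missing image or datasheet becomes an empty string, a missing brand skips the row), and a write loop that, as in Example #5, counts database failures but leaves its cutoff commented out, so a persistently failing Oracle insert retries forever.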