def testPrintSeedDBSingleNiche(self):
    """Print what the seed DB reports as the total for ``Society/Law``.

    Opens the category seed database at a fixed local path, calls
    ``get_total("Society/Law", TF=0)`` on it and prints the result.

    The handle is closed in a ``finally`` clause so it is released even
    when the query raises.
    """
    seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
    parameters = {"TF": 0}
    db = CategorySeedSiteDB(seed_db_addr)
    try:
        total = db.get_total("Society/Law", **parameters)
    finally:
        db.close()
    print(total)
# Example #2
# 0
def get_seeds_normal(categoy_db_addr: str, seed_limit: int, niche: str,
                     parameters: dict):
    """Return the ``ref_domain`` values read from one niche table.

    Args:
        categoy_db_addr: Path of the category seed database to open.
        seed_limit: Forwarded to ``get_from_table`` as its third positional
            argument, after a start value of 0 (presumably the maximum
            number of rows to read; confirm against ``CategorySeedSiteDB``).
        niche: Table name handed to ``get_from_table``.
        parameters: Forwarded to ``get_from_table`` as ``filter_dict``.

    Returns:
        A list holding the ``ref_domain`` attribute of every row returned,
        in the order ``get_from_table`` yields them (``random_read=False``).
    """
    db = CategorySeedSiteDB(categoy_db_addr)
    try:
        rows = db.get_from_table(
            niche, 0, seed_limit, random_read=False, filter_dict=parameters)
        # The list is fully built before ``finally`` runs, so nothing is
        # read from the database after it has been closed.
        return [row.ref_domain for row in rows]
    finally:
        # Release the handle even if the read or attribute access fails.
        db.close()
 def test_db_migration(self):
     """Copy every table of the old seed DB into the new seed DB.

     Table names are taken from ``sqlite_master`` of the old database.
     For each one, up to 10000000 entries are read with
     ``get_from_table`` (non-reversed, non-random) and written to the
     same-named table of the new database with ``save_to_table``.

     Both handles are closed in ``finally`` clauses so that a failure in
     the middle of the copy does not leak either connection.
     """
     old_seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
     new_seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db"
     old_db = CategorySeedSiteDB(old_seed_db_addr)
     try:
         new_db = CategorySeedSiteDB(new_seed_db_addr)
         try:
             # Materialise the names first: the cursor is not left
             # mid-iteration while the tables are being read below.
             old_tables = [
                 x[0] for x in old_db.cur.execute(
                     "SELECT name FROM sqlite_master WHERE type='table';")
             ]
             print('table length:', len(old_tables))
             for table in old_tables:
                 print('doing table:', table)
                 data = old_db.get_from_table(
                     table, 0, 10000000, reverse_read=False, random_read=False)
                 print('data len for table:', len(data))
                 new_db.save_to_table(table, data)
         finally:
             new_db.close()
     finally:
         old_db.close()
 def test_db_migration(self):
     """Migrate all tables from ``CategorySeedDB.db`` to ``NewCategorySeedDB.db``.

     The list of tables comes from the old database's ``sqlite_master``.
     Each table is read in one call (start 0, limit 10000000, no reverse
     or random ordering) and saved under the same name in the new
     database.

     The two database handles are released through ``try``/``finally``
     so an exception during the copy cannot leave them open.
     """
     old_seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
     new_seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db"
     old_db = CategorySeedSiteDB(old_seed_db_addr)
     try:
         new_db = CategorySeedSiteDB(new_seed_db_addr)
         try:
             old_tables = [
                 x[0] for x in old_db.cur.execute(
                     "SELECT name FROM sqlite_master WHERE type='table';")
             ]
             print('table length:', len(old_tables))
             for table in old_tables:
                 print('doing table:', table)
                 data = old_db.get_from_table(table,
                                              0,
                                              10000000,
                                              reverse_read=False,
                                              random_read=False)
                 print('data len for table:', len(data))
                 new_db.save_to_table(table, data)
         finally:
             # Closed first because it was opened last.
             new_db.close()
     finally:
         old_db.close()
    def testImportSeeds0(self):
        """Import rows of ``Gambling3.csv`` into the ``Games/Gambling`` table.

        The first CSV line is treated as a header and skipped.  Every
        following row with exactly six columns
        (domain, backlink, tf, cf, topic, topical_tf) becomes a
        ``MajesticBacklinkDataStruct``; rows with another column count are
        ignored.  Reading stops at the first empty row or at end of file.
        All collected records are then saved in one ``save_to_table`` call.

        Row handling is best-effort: an exception raised while reading or
        converting a row is printed and the import continues with the next
        row, unless the exception has an empty message, which ends the loop.
        """
        seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db"
        path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/Gambling3.csv"
        db = CategorySeedSiteDB(seed_db_addr)
        try:
            temp = []
            # The csv module asks for newline='' so that it can handle line
            # endings and embedded newlines itself.
            with open(path, mode='rt', newline='') as csv_file:
                rd = csv.reader(csv_file, delimiter=',')
                # Skip the header; the default keeps an empty file from
                # raising StopIteration outside the per-row handler.
                next(rd, None)
                counter = 0
                while True:
                    try:
                        row = next(rd)
                        if not row:
                            break
                        if len(row) == 6:
                            # backlink and topic columns are present in the
                            # file but not stored.
                            domain, _backlink, tf, cf, _topic, topical_tf = row
                            print("current loc:", counter, "data:", row)
                            temp.append(
                                MajesticBacklinkDataStruct(
                                    ref_domain=domain,
                                    src_cf=int(cf),
                                    src_tf=int(tf),
                                    src_topical_tf=int(topical_tf)))
                    except StopIteration:
                        print('stop iteration')
                        break
                    except Exception as ex:
                        # Deliberately broad: a bad row (e.g. a non-numeric
                        # tf/cf) is reported and skipped, not fatal.
                        print("exception:", str(ex), "row:", str(counter))
                        if not str(ex):
                            break
                    finally:
                        counter += 1

            db.save_to_table('Games/Gambling', temp)
        finally:
            # Release the handle even if opening or parsing the CSV fails.
            db.close()
# Example #6
# 0
    def testSeedsUpload2(self) -> int:
        """Read seed domains per category and push them to the server.

        For every entry of ``target_ca`` (category -> number of seeds), the
        ``ref_domain`` values are read from the seed DB starting at offset
        68000 with the ``TF=0`` filter, wrapped in a ``MiningList`` and sent
        through a ``HostController`` running ``ServerCommand.Com_Add_Seed``.
        Each controller is started and joined before the next category is
        processed.

        Returns:
            The number of domains read.  With more than one category the
            count is multiplied by 0.97.

        NOTE(review): the multi-category path returns a float although the
        annotation says ``int``; confirm which of the two is intended.
        NOTE(review): ``seed_db_name`` is not defined in this method and
        must come from the enclosing module.
        """
        categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db"
        target_ca = {
            "Games/Gambling": 62000,
            # "Society/Politics":20000,
            # "Society/Issues":20000,
            # "Business/Financial Services":20000,
        }

        total_seeds = 0
        parameters = {"TF": 0}
        db = CategorySeedSiteDB(categoy_db_addr)
        try:
            for ca, seeds_needed in target_ca.items():
                rows = db.get_from_table(ca,
                                         68000,
                                         seeds_needed,
                                         random_read=False,
                                         reverse_read=False,
                                         filter_dict=parameters)
                sites = [x.ref_domain for x in rows]
                total_seeds += len(sites)
                print("doing site:", ca, " size:", len(sites))
                in_data = MiningList(ref=seed_db_name, data=sites)
                ser = get_server()
                hostController = HostController(ser,
                                                cmd=ServerCommand.Com_Add_Seed,
                                                in_data=in_data)
                hostController.start()
                hostController.join()
        finally:
            # Release the handle even if a read or an upload fails.
            db.close()
        if len(target_ca) > 1:
            return total_seeds * 0.97
        return total_seeds
    def testGetSeedsFromBacklinks(self):
        """Grow the seed DB from the ref-domains of already stored seeds.

        Steps, in order:

        1. Load the bad-country list (upper-cased) from ``bad_country.txt``.
        2. For every category in ``target_ca`` read ``ref_domain`` values
           from the seed DB with the ``{"TF": 25}`` filter and drop domains
           ending with any entry of ``forbidden_list``.
        3. Reduce the candidates to the starting set ``sites``: all of them
           when there are at most 400, every second one when there are at
           most 800, otherwise random distinct picks until 400 are chosen.
        4. Hand ``sites`` to
           ``GoogleMajestic.get_sites_by_seed_sites_muti_threads`` with
           ``backlink_callback_inner`` as callback.
        5. The callback logs accepted records to ``GeneralSeed5.csv`` and
           buffers them in ``seed_manager`` under their decoded topic.
        6. Close ``seed_manager``.

        NOTE(review): ``majestic`` is not defined in this method; it must be
        provided by the enclosing module.
        NOTE(review): ``time``, ``counter``, ``total_count``, ``key_words``,
        ``seed_site_file_path`` and ``category_manager`` are assigned but not
        used by the active code below.
        """
        import random
        import time
        logging_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/GeneralSeed5.csv"
        # seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB_WithCountry_Temp.db"
        seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB3.db"
        # logging_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/GeneralSeed4.csv"
        # seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
        # Seeds are read from and saved to the same file (CategorySeedDB3.db).
        save_seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB3.db"
        category_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/test/CategoryDB.db"
        seed_site_file_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/SiteFromResults.txt"
        db = CategorySeedSiteDB(seed_db_addr)
        basic_manager = CategoryManager()
        thread_pool_size = 20
        max_count = 5000

        category_manager = CategoryDBManager(category_db_addr)
        seed_manager = CategorySiteDBManager(
            CategorySeedSiteDB, db_path=save_seed_db_addr)  # was seed_db_addr
        # 75% of thread_pool_size * max_count, i.e. 75000 with the values above.
        seed_manager._max_site_limit = int(thread_pool_size * max_count * 0.75)

        counter = 0
        country_file_path = "/Users/superCat/Desktop/PycharmProjectPortable/SpamFilter/bad_country.txt"
        # Upper-cased so the membership test in the callback compares
        # against upper-case country codes.
        bad_countries = [
            x.upper()
            for x in FileIO.FileHandler.read_lines_from_file(country_file_path)
        ]

        # Callback passed to the Majestic crawl below.  Accepts either a
        # MajesticRefDomainStruct or a MajesticBacklinkDataStruct.
        def backlink_callback_inner(link_data):

            if isinstance(link_data, MajesticRefDomainStruct):
                # Reject ref-domains from a bad country or with TF outside
                # the 5..95 range; everything else is converted to the
                # backlink struct handled by the second branch.
                if link_data.country in bad_countries or link_data.tf < 5 or link_data.tf > 95:
                    # Setting None makes the isinstance check below fail,
                    # so the rejected record is neither logged nor stored.
                    link_data = None
                    pass
                else:
                    link_data = MajesticBacklinkDataStruct(
                        ref_domain=link_data.domain,
                        backlink=link_data.domain,
                        src_tf=link_data.tf,
                        src_cf=link_data.cf,
                        src_topic=link_data.src_topic,
                        src_topical_tf=link_data.src_topic_tf,
                        country_code=link_data.country,
                        potential_url=link_data.potential_url)

            if isinstance(link_data, MajesticBacklinkDataStruct):
                # Records whose topic is empty or a single character are
                # dropped silently.
                if len(link_data.src_topic) > 1:
                    decoded_topic = basic_manager.decode_sub_category(
                        link_data.src_topic, False)
                    # print(backlink)
                    Logging.CsvLogger.log_to_file_path(logging_path, [
                        link_data.to_tuple(),
                    ])
                    seed_manager.append_to_buff(data=link_data,
                                                category=str(decoded_topic))

        total_count = 0
        seed_init_limit = 400
        seed_depth_limit = 3000
        # temp_niches is empty, so niches stays empty and the three loops
        # over niches in this method do nothing as written.
        temp_niches = []
        niches = []

        for niche in temp_niches:  # make valid niche for seeds
            # if niche.endswith("General"):
            #     niches.append(niche.rstrip("General"))
            # else:
            niches.append(niche)

        # Mix of full domains and bare TLD suffixes; all are matched with
        # str.endswith in check_ending below.
        forbidden_list = [
            "bbc.co.uk", "wikipedia.org", "youtube.com", "amazon.co.uk",
            "facebook.com", "google.com", ".ru", ".cn", ".jp"
        ]
        for niche in niches:
            decoded_topic = basic_manager.decode_sub_category(niche, False)
            print(decoded_topic)
        minimum_tf = 25
        temp_sites = []
        target_ca = [
            "Society/Law", "Society/Politics", "Society/Issues",
            "Business/Financial Services", "Society/Government"
        ]
        sites = []
        parameters = {"TF": minimum_tf}
        key_words = [
            "Alcohol law", "Banking law", "Antitrust law", "Aviation law",
            "Corporate law", "Communications law", "Construction law",
            "Consumer law", "Drug control law", "Insurance law", "Tax law"
        ]
        # for item in key_words:
        #     temp_sites += GoogleCom.get_sites(keyword=item, index=0, filter_list=forbidden_list, blog=True)[0:]
        #     print("sites count:", len(temp_sites))
        #     time.sleep(2)
        # temp_sites = FileHandler.read_lines_from_file(seed_site_file_path)
        # temp_sites = list(set(temp_sites))
        # temp_sites is still empty here, so this always prints 0.
        print("seeds total:", len(temp_sites))
        categories = db.get_sub_category_tables_name()
        # Extend the target categories with every table whose name contains
        # one of the niches.
        for niche in niches:
            target_ca += [x for x in categories if niche in x]

        seed_count = 0
        # Up to 1600 rows are requested per category.
        load_limit = seed_init_limit * 4

        # Returns True when the domain does NOT end with any forbidden entry.
        def check_ending(domain: str):
            is_wrong_ending = False
            for item in forbidden_list:
                if domain.endswith(item):
                    is_wrong_ending = True
                    break
            return not is_wrong_ending

        # NOTE(review): parameters is passed positionally here, whereas
        # other callers in this file pass it as filter_dict=...; confirm the
        # fourth positional parameter of get_from_table is filter_dict.
        for ca in target_ca:
            temp_sites += [
                y for y in filter(check_ending, [
                    x.ref_domain for x in db.get_from_table(ca,
                                                            0,
                                                            load_limit,
                                                            parameters,
                                                            reverse_read=True,
                                                            random_read=True)
                ])
            ]

        db.close()

        seed_count = len(temp_sites)
        # seed_init_limit = seed_count  #---------------------------

        if seed_count <= seed_init_limit:
            sites = temp_sites
        elif seed_init_limit < seed_count <= seed_init_limit * 2:
            # Take every second candidate, giving at most seed_init_limit.
            sites = temp_sites[::2]
        else:
            # NOTE(review): temp_sites is never de-duplicated; if it holds
            # fewer than seed_init_limit distinct domains this loop cannot
            # terminate.
            while len(sites) < seed_init_limit:
                site = temp_sites[random.randint(0, seed_count - 1)]
                if site not in sites:
                    sites.append(site)
        # GoogleMajestic.get_sites_by_seed_sites(majestic, sites, catagories=niches, iteration=1,
        #                                                    count_per_domain=max_count, callback=backlink_callback_inner,
        #                                                    max_count=seed_depth_limit + seed_init_limit, tf=minimum_tf)
        GoogleMajestic.get_sites_by_seed_sites_muti_threads(
            majestic,
            sites,
            catagories=target_ca,
            iteration=4,
            count_per_domain=max_count,
            callback=backlink_callback_inner,
            max_count=seed_depth_limit + seed_init_limit,
            thread_pool_size=thread_pool_size,
            tf=minimum_tf,
            get_backlinks=False,
            bad_country_list=bad_countries)
        seed_manager.close()