예제 #1
0
def get_seeds(categoy_db_addr: str, seed_limit: int, niche: str,
              parameters: dict):
    """Collect seed domains for a niche from the category seed database.

    Every sub-category table whose name contains ``niche`` is read in random
    order (at most ``seed_limit * 5`` rows per table, filtered through
    ``parameters``) and the ``ref_domain`` of each row is pooled.  The pool is
    then thinned down:

    * pool size <= ``seed_limit``: the pool is returned unchanged;
    * pool size <= ``2 * seed_limit``: every second entry is returned;
    * otherwise: up to ``seed_limit`` distinct domains are drawn at random.

    Args:
        categoy_db_addr: path handed to ``CategorySeedSiteDB``.
        seed_limit: target number of seeds to return.
        niche: substring matched against the sub-category table names.
        parameters: passed to ``get_from_table`` as ``filter_dict``.

    Returns:
        A list of ``ref_domain`` values.
    """
    db = CategorySeedSiteDB(categoy_db_addr)
    try:
        target_ca = [
            x for x in db.get_sub_category_tables_name() if niche in x
        ]
        load_limit = seed_limit * 5
        temp_sites = []
        for ca in target_ca:
            print("getting seeds from:", ca)
            temp_sites += [
                x.ref_domain for x in db.get_from_table(
                    ca, 0, load_limit, random_read=True,
                    filter_dict=parameters)
            ]
    finally:
        # Release the database handle even when a read fails.
        db.close()

    seed_count = len(temp_sites)
    if seed_count <= seed_limit:
        return temp_sites
    if seed_count <= seed_limit * 2:
        return temp_sites[::2]

    # Draw distinct domains.  De-duplicating first bounds the work: a
    # retry-until-unique loop would spin forever when the pool contains fewer
    # distinct domains than seed_limit.
    unique_sites = list(dict.fromkeys(temp_sites))
    pick_count = max(0, min(seed_limit, len(unique_sites)))
    return random.sample(unique_sites, pick_count)
    def testeedExport(self):
        """Export marketing/advertising seed domains into a ``SeedSiteDB``.

        For every sub-category table whose name contains
        "Business/Marketing and Advertising", up to ``seeds_needed`` rows are
        read (non-random, filtered with ``{"CF": 20}``) and each row that is a
        ``MajesticBacklinkDataStruct`` is stored as ``(ref_domain, 0)`` in the
        seed database.
        """
        seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/SeedSitesList"
        categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
        seeds_needed = 20000
        percentage = 1
        parameters = {
            "CF": 20,
        }

        seed_db = SeedSiteDB("26/10/2015 Marketing CF20", db_addr=seed_db_addr)
        try:
            db = CategorySeedSiteDB(categoy_db_addr)
            try:
                target_ca = [
                    x for x in db.get_sub_category_tables_name()
                    if "Business/Marketing and Advertising" in x
                ]
                for ca in target_ca:
                    count = db.get_total(ca)
                    # The absolute cap only applies when the whole table
                    # (percentage == 1) is requested.
                    if percentage == 1 and count > seeds_needed:
                        count = seeds_needed
                    count = int(percentage * count)
                    if count <= 0:
                        continue
                    temp = db.get_from_table(ca,
                                             0,
                                             count,
                                             random_read=False,
                                             filter_dict=parameters)
                    # A fresh list per category: nothing handed to
                    # add_sites is mutated afterwards.
                    sites = [(item.ref_domain, 0) for item in temp
                             if isinstance(item, MajesticBacklinkDataStruct)]
                    seed_db.add_sites(sites, skip_check=True)
            finally:
                # The category database was previously never closed.
                db.close()
        finally:
            seed_db.close()
예제 #3
0
def get_seeds_normal(categoy_db_addr: str, seed_limit: int, niche: str,
                     parameters: dict):
    """Read up to ``seed_limit`` seed domains from one category table.

    Args:
        categoy_db_addr: path handed to ``CategorySeedSiteDB``.
        seed_limit: maximum number of rows requested from the table.
        niche: name of the table to read (used as-is, no substring matching).
        parameters: passed to ``get_from_table`` as ``filter_dict``.

    Returns:
        A list with the ``ref_domain`` of every row returned, read with
        ``random_read=False``.
    """
    db = CategorySeedSiteDB(categoy_db_addr)
    try:
        rows = db.get_from_table(
            niche, 0, seed_limit, random_read=False, filter_dict=parameters)
        return [x.ref_domain for x in rows]
    finally:
        # Close the handle even if the read raises.
        db.close()
 def test_merge_db(self):
     """Copy every sub-category table of CategorySeedDB2.db into CategorySeedDB.db.

     Each table is read in full (up to 10,000,000 rows, non-random,
     non-reversed) and saved under the same table name in the target
     database.  Both database handles are closed when the merge finishes or
     fails.
     """
     merge_from = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB2.db"
     merge_to = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
     db_from = CategorySeedSiteDB(merge_from)
     try:
         db_to = CategorySeedSiteDB(merge_to)
         try:
             for item in db_from.get_sub_category_tables_name():
                 results = db_from.get_from_table(item, 0, 10000000, reverse_read=False, random_read=False)
                 print("adding:", item, "result:", len(results))
                 db_to.save_to_table(item, results)
         finally:
             db_to.close()
     finally:
         db_from.close()
 def test_merge_db(self):
     """Merge all sub-category tables from CategorySeedDB2.db into CategorySeedDB.db.

     Every table name reported by the source database is read in full (up to
     10,000,000 rows, non-random, non-reversed) and written to the table of
     the same name in the target database.  Both handles are closed
     afterwards, including when a read or write raises.
     """
     merge_from = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB2.db"
     merge_to = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
     db_from = CategorySeedSiteDB(merge_from)
     try:
         db_to = CategorySeedSiteDB(merge_to)
         try:
             for item in db_from.get_sub_category_tables_name():
                 results = db_from.get_from_table(item,
                                                  0,
                                                  10000000,
                                                  reverse_read=False,
                                                  random_read=False)
                 print("adding:", item, "result:", len(results))
                 db_to.save_to_table(item, results)
         finally:
             db_to.close()
     finally:
         db_from.close()
 def test_db_migration(self):
     """Copy every table listed in the old database's sqlite_master into the new database file."""
     source_db = CategorySeedSiteDB(
         "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db")
     target_db = CategorySeedSiteDB(
         "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db")

     # Collect all table names up front, before the per-table reads begin.
     table_query = "SELECT name FROM sqlite_master WHERE type='table';"
     table_names = []
     for row in source_db.cur.execute(table_query):
         table_names.append(row[0])
     print('table length:', len(table_names))

     for name in table_names:
         print('doing table:', name)
         rows = source_db.get_from_table(
             name, 0, 10000000, reverse_read=False, random_read=False)
         print('data len for table:', len(rows))
         target_db.save_to_table(name, rows)

     source_db.close()
     target_db.close()
 def test_db_migration(self):
     """Migrate all tables from CategorySeedDB.db to NewCategorySeedDB.db, table by table."""
     src = CategorySeedSiteDB(
         "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db")
     dst = CategorySeedSiteDB(
         "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db")

     # Table names come straight from sqlite_master; the list is fully built
     # before any table is read.
     names = []
     for record in src.cur.execute(
             "SELECT name FROM sqlite_master WHERE type='table';"):
         names.append(record[0])
     print('table length:', len(names))

     # Same read options for every table: plain forward, non-random read.
     read_options = {"reverse_read": False, "random_read": False}
     for table_name in names:
         print('doing table:', table_name)
         contents = src.get_from_table(table_name, 0, 10000000, **read_options)
         print('data len for table:', len(contents))
         dst.save_to_table(table_name, contents)

     src.close()
     dst.close()
예제 #8
0
    def testSeedsUpload2(self) -> int:
        """Upload seed domains of the configured categories to the mining server.

        For every ``category -> seeds_needed`` entry in ``target_ca`` the
        ``ref_domain`` values are read from the category database (second
        positional argument 68000, non-random, non-reversed, filtered with
        ``{"TF": 0}``), wrapped in a ``MiningList`` and sent through a
        ``HostController`` using ``ServerCommand.Com_Add_Seed``.  Each upload
        is joined before the next category is processed.

        NOTE(review): ``seed_db_name`` is not defined inside this method; it
        must be supplied by an enclosing scope -- confirm.

        Returns:
            The number of seeds read.  When more than one category is
            configured the total is scaled by 0.97 and truncated to an
            integer so the result matches the declared ``int`` return type.
        """
        categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db"
        target_ca = {
            "Games/Gambling": 62000,
            # "Society/Politics":20000,
            # "Society/Issues":20000,
            # "Business/Financial Services":20000,
        }

        total_seeds = 0
        parameters = {"TF": 0}
        db = CategorySeedSiteDB(categoy_db_addr)
        try:
            for ca, seeds_needed in target_ca.items():
                sites = [
                    x.ref_domain for x in db.get_from_table(ca,
                                                            68000,
                                                            seeds_needed,
                                                            random_read=False,
                                                            reverse_read=False,
                                                            filter_dict=parameters)
                ]
                total_seeds += len(sites)
                print("doing site:", ca, " size:", len(sites))
                in_data = MiningList(ref=seed_db_name, data=sites)
                ser = get_server()
                hostController = HostController(ser,
                                                cmd=ServerCommand.Com_Add_Seed,
                                                in_data=in_data)
                hostController.start()
                hostController.join()
        finally:
            # Release the database even if a read or an upload fails.
            db.close()

        if len(target_ca) > 1:
            # The scaled total used to be returned as a float despite the
            # ``-> int`` annotation.
            return int(total_seeds * 0.97)
        return total_seeds
    def testGetSeedsFromBacklinks(self):
        """Grow the category seed database from a sample of existing seeds.

        Steps, as performed below:

        1. Read ``ref_domain`` values for each category in ``target_ca`` from
           ``seed_db_addr``, dropping domains that end with an entry of
           ``forbidden_list``.
        2. Reduce that pool to roughly ``seed_init_limit`` start sites.
        3. Hand the start sites to
           ``GoogleMajestic.get_sites_by_seed_sites_muti_threads`` with
           ``backlink_callback_inner`` as callback; the callback logs each
           accepted record to ``logging_path`` and buffers it in
           ``seed_manager`` (backed by ``save_seed_db_addr``).

        NOTE(review): ``majestic`` is used in the final call but is not
        defined in this method -- it must come from an enclosing scope.
        """
        import random
        # NOTE: ``time`` is only referenced by the commented-out code below.
        import time
        logging_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/GeneralSeed5.csv"
        # seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB_WithCountry_Temp.db"
        seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB3.db"
        # logging_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/GeneralSeed4.csv"
        # seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
        save_seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB3.db"
        category_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/test/CategoryDB.db"
        # Only referenced by the commented-out code further down.
        seed_site_file_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/SiteFromResults.txt"
        # Source database the start sites are read from.
        db = CategorySeedSiteDB(seed_db_addr)
        basic_manager = CategoryManager()
        thread_pool_size = 20
        max_count = 5000

        # Constructed but not used again in this method.
        category_manager = CategoryDBManager(category_db_addr)
        # Destination for the records accepted by the callback.
        seed_manager = CategorySiteDBManager(
            CategorySeedSiteDB, db_path=save_seed_db_addr)  # was seed_db_addr
        # Limit set to 75% of thread_pool_size * max_count.
        seed_manager._max_site_limit = int(thread_pool_size * max_count * 0.75)

        # Not used again in this method.
        counter = 0
        country_file_path = "/Users/superCat/Desktop/PycharmProjectPortable/SpamFilter/bad_country.txt"
        # Upper-cased lines of the bad-country file; compared against
        # link_data.country in the callback.
        bad_countries = [
            x.upper()
            for x in FileIO.FileHandler.read_lines_from_file(country_file_path)
        ]

        # Callback passed to GoogleMajestic: normalises, filters, logs and
        # buffers one record.
        def backlink_callback_inner(link_data):

            if isinstance(link_data, MajesticRefDomainStruct):
                # Drop ref-domain records from a bad country or with tf
                # outside the range 5..95; setting link_data to None makes the
                # second isinstance check below fail, so nothing is stored.
                if link_data.country in bad_countries or link_data.tf < 5 or link_data.tf > 95:
                    link_data = None
                    pass
                else:
                    # Convert to the backlink structure; the domain is used
                    # for both ref_domain and backlink.
                    link_data = MajesticBacklinkDataStruct(
                        ref_domain=link_data.domain,
                        backlink=link_data.domain,
                        src_tf=link_data.tf,
                        src_cf=link_data.cf,
                        src_topic=link_data.src_topic,
                        src_topical_tf=link_data.src_topic_tf,
                        country_code=link_data.country,
                        potential_url=link_data.potential_url)

            if isinstance(link_data, MajesticBacklinkDataStruct):
                # Only records whose src_topic is longer than one character
                # are logged and buffered.
                if len(link_data.src_topic) > 1:
                    decoded_topic = basic_manager.decode_sub_category(
                        link_data.src_topic, False)
                    # print(backlink)
                    Logging.CsvLogger.log_to_file_path(logging_path, [
                        link_data.to_tuple(),
                    ])
                    seed_manager.append_to_buff(data=link_data,
                                                category=str(decoded_topic))

        # Not used again in this method.
        total_count = 0
        seed_init_limit = 400
        seed_depth_limit = 3000
        # temp_niches is empty, so niches stays empty and every loop over
        # niches below does nothing.
        temp_niches = []
        niches = []

        for niche in temp_niches:  # make valid niche for seeds
            # if niche.endswith("General"):
            #     niches.append(niche.rstrip("General"))
            # else:
            niches.append(niche)

        # Domain endings rejected by check_ending() below.
        forbidden_list = [
            "bbc.co.uk", "wikipedia.org", "youtube.com", "amazon.co.uk",
            "facebook.com", "google.com", ".ru", ".cn", ".jp"
        ]
        for niche in niches:
            decoded_topic = basic_manager.decode_sub_category(niche, False)
            print(decoded_topic)
        minimum_tf = 25
        temp_sites = []
        target_ca = [
            "Society/Law", "Society/Politics", "Society/Issues",
            "Business/Financial Services", "Society/Government"
        ]
        sites = []
        parameters = {"TF": minimum_tf}
        # Only referenced by the commented-out code below.
        key_words = [
            "Alcohol law", "Banking law", "Antitrust law", "Aviation law",
            "Corporate law", "Communications law", "Construction law",
            "Consumer law", "Drug control law", "Insurance law", "Tax law"
        ]
        # for item in key_words:
        #     temp_sites += GoogleCom.get_sites(keyword=item, index=0, filter_list=forbidden_list, blog=True)[0:]
        #     print("sites count:", len(temp_sites))
        #     time.sleep(2)
        # temp_sites = FileHandler.read_lines_from_file(seed_site_file_path)
        # temp_sites = list(set(temp_sites))
        print("seeds total:", len(temp_sites))
        categories = db.get_sub_category_tables_name()
        for niche in niches:
            target_ca += [x for x in categories if niche in x]

        # This initial value is overwritten after the read loop below.
        seed_count = 0
        load_limit = seed_init_limit * 4

        # Returns True when the domain does not end with any forbidden_list
        # entry.
        def check_ending(domain: str):
            is_wrong_ending = False
            for item in forbidden_list:
                if domain.endswith(item):
                    is_wrong_ending = True
                    break
            return not is_wrong_ending

        # NOTE(review): ``parameters`` is passed as the 4th positional
        # argument here -- presumably it is meant to be the row filter;
        # confirm against get_from_table's signature.
        for ca in target_ca:
            temp_sites += [
                y for y in filter(check_ending, [
                    x.ref_domain for x in db.get_from_table(ca,
                                                            0,
                                                            load_limit,
                                                            parameters,
                                                            reverse_read=True,
                                                            random_read=True)
                ])
            ]

        db.close()

        seed_count = len(temp_sites)
        # seed_init_limit = seed_count  #---------------------------

        # Reduce the pool: keep everything, keep every second entry, or pick
        # seed_init_limit distinct entries at random.
        if seed_count <= seed_init_limit:
            sites = temp_sites
        elif seed_init_limit < seed_count <= seed_init_limit * 2:
            sites = temp_sites[::2]
        else:
            # NOTE(review): this loop only ends once seed_init_limit distinct
            # sites were picked; if temp_sites holds fewer distinct values
            # than that, it never terminates.
            while len(sites) < seed_init_limit:
                site = temp_sites[random.randint(0, seed_count - 1)]
                if site not in sites:
                    sites.append(site)
        # GoogleMajestic.get_sites_by_seed_sites(majestic, sites, catagories=niches, iteration=1,
        #                                                    count_per_domain=max_count, callback=backlink_callback_inner,
        #                                                    max_count=seed_depth_limit, tf=minimum_tf)
        GoogleMajestic.get_sites_by_seed_sites_muti_threads(
            majestic,
            sites,
            catagories=target_ca,
            iteration=4,
            count_per_domain=max_count,
            callback=backlink_callback_inner,
            max_count=seed_depth_limit + seed_init_limit,
            thread_pool_size=thread_pool_size,
            tf=minimum_tf,
            get_backlinks=False,
            bad_country_list=bad_countries)
        seed_manager.close()