def testPrintSeedDBSingleNiche(self):
     seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
     parameters = {"TF": 0}
     db = CategorySeedSiteDB(seed_db_addr)
     total = db.get_total("Society/Law", **parameters)
     db.close()
     print(total)
    def testSeedExport(self):
        seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/SeedSitesList"
        seed_db = SeedSiteDB("26/10/2015 Marketing CF20", db_addr=seed_db_addr)

        categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
        db = CategorySeedSiteDB(categoy_db_addr)
        # seed_manager = CategorySiteDBManager(CategorySeedSiteDB, db_path=categoy_db_addr)
        categories = db.get_sub_category_tables_name()
        target_ca = [
            x for x in categories if "Business/Marketing and Advertising" in x
        ]
        sites = []
        seeds_needed = 20000
        percentage = 1
        parameters = {
            "CF": 20,
        }
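        # For each matching category, pull up to seeds_needed domains through
        # the CF filter above and add them to the export seed DB as
        # (domain, 0) tuples.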
        for ca in target_ca:
            sites.clear()
            count = db.get_total(ca)
            if percentage == 1 and count > seeds_needed:
                count = seeds_needed
            count = int(percentage * count)
            if count > 0:
                temp = db.get_from_table(ca,
                                         0,
                                         count,
                                         random_read=False,
                                         filter_dict=parameters)
                for item in temp:
                    if isinstance(item, MajesticBacklinkDataStruct):
                        sites.append((item.ref_domain, 0))
                seed_db.add_sites(sites, skip_check=True)
        seed_db.close()
        db.close()
Example #3
import random

def get_seeds(categoy_db_addr: str, seed_limit: int, niche: str,
              parameters: dict):
    db = CategorySeedSiteDB(categoy_db_addr)
    # if niche.endswith("General"):
    #     niche = niche.rstrip("General")

    temp_sites = []
    target_ca = []
    sites = []
    categories = db.get_sub_category_tables_name()
    target_ca += [x for x in categories if niche in x]

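    # Over-fetch (5x the requested limit per category) so the downsampling
    # below still has a pool of candidates to choose from.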
    load_limit = seed_limit * 5
    for ca in target_ca:
        print("getting seeds from:", ca)
        temp_sites += [
            x.ref_domain for x in db.get_from_table(
                ca, 0, load_limit, random_read=True, filter_dict=parameters)
        ]

    seed_count = len(temp_sites)

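    # Downsampling: keep everything when at or under the limit, take every
    # other seed when under twice the limit, otherwise sample randomly
    # without duplicates until the limit is reached.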
    if seed_count <= seed_limit:
        sites = temp_sites
    elif seed_limit < seed_count <= seed_limit * 2:
        sites = temp_sites[::2]
    else:
        while len(sites) < seed_limit:
            site = temp_sites[random.randint(0, seed_count - 1)]
            if site not in sites:
                sites.append(site)
    db.close()  # close the seed DB handle before returning
    return sites
 def testPrintSeedDB(self):
     seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB3.db"
     log_file_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/SeedLog3.csv"
     enable_log = True
     FileHandler.remove_file_if_exist(log_file_path)
     db = CategorySeedSiteDB(seed_db_addr)
     # seed_manager = CategorySiteDBManager(CategorySeedSiteDB, db_path=seed_db_addr)
     categories = db.get_sub_category_tables_name()
     total_count = 0
     target_niche = ""
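     # Leave target_niche empty to include every category table in the count.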
     parameters = {"TF": 0}
     if enable_log:
         CsvLogger.log_to_file_path(log_file_path, [
             ("parameters", str(parameters)),
         ])
     # parameters = {"TF": 20}
     for item in categories:
         if target_niche in item or len(target_niche) == 0:
             count = db.get_total(item, **parameters)
             total_count += count
             print(item, "  ", count)
             if enable_log:
                 CsvLogger.log_to_file_path(log_file_path, [
                     (item, str(count)),
                 ])
     print("total:", total_count)
     if enable_log:
         CsvLogger.log_to_file_path(log_file_path, [
             ("total", str(total_count)),
         ])
Example #5
 def testSeedsUpload_General(self) -> int:
     categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB3.db"
     # categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB2.db"
     db = CategorySeedSiteDB(categoy_db_addr)
     categories = db.get_sub_category_tables_name()
     db.close()  # only the table names are needed from this handle
     forbidden_niche = ['Adult/', 'Gambling', 'Law', 'Directory']
     target_ca = {}
     seed_count = 200
     for item in categories:
         if not any(x in item for x in forbidden_niche):
             target_ca.update({item: seed_count})
     # seeds_needed = 20000
     total_seeds = 0
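     # seed_db_name, get_server() and HostController are assumed to be defined
     # elsewhere in this module; they are not shown in this snippet.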
     in_data = MiningList(ref=seed_db_name, data=[])
     parameters = {"TF": 5}
     counter = 0
     for ca, seeds_needed in target_ca.items():
         sites = get_seeds_normal(categoy_db_addr, seeds_needed, ca,
                                  parameters)
         total_seeds += len(sites)
         print(counter, " doing site:", ca, " size:", len(sites), " total:",
               total_seeds)
         in_data.data += sites
         counter += 1
     ser = get_server()
     hostController = HostController(ser,
                                     cmd=ServerCommand.Com_Add_Seed,
                                     in_data=in_data)
     hostController.start()
     hostController.join()
     if len(target_ca) > 1:
         return int(total_seeds * 0.97)
     else:
         return total_seeds
Example #6
def get_seeds_normal(categoy_db_addr: str, seed_limit: int, niche: str,
                     parameters: dict):
    db = CategorySeedSiteDB(categoy_db_addr)
    temp = [
        x.ref_domain for x in db.get_from_table(
            niche, 0, seed_limit, random_read=False, filter_dict=parameters)
    ]
    db.close()
    return temp
 def testImportSeeds(self):
     seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
     category_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/test/CategoryDB.db"
     db = CategorySeedSiteDB(seed_db_addr)
     basic_manager = CategoryManager()
     category_manager = CategoryDBManager(category_db_addr)
     seed_manager = CategorySiteDBManager(CategorySeedSiteDB,
                                          db_path=seed_db_addr)
     import csv
     path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/Gambling3.csv"
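     # Each CSV row is expected as: domain, backlink, TF, CF, topic, topical TF.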
     counter = 0
     with open(path, mode='r', newline='', encoding='utf-8') as csv_file:
         # lines = len(csv_file.readlines())
         rd = csv.reader(csv_file, delimiter=',')
         for row in rd:
             try:
                 if len(row) == 6:
                     domain, backlink, tf, cf, topic, topical_tf = row
                     if len(topic) > 0:
                         decoded_topic = basic_manager.decode_sub_category(
                             topic, False)
                         data = MajesticBacklinkDataStruct(
                             ref_domain=domain,
                             src_cf=int(cf),
                             src_tf=int(tf),
                             src_topic=str(decoded_topic),
                             src_topical_tf=int(topical_tf))
                         seed_manager.append_to_buff(
                             data=data, category=str(decoded_topic))
             except Exception as ex:
                 print(ex, "row:", row)
             finally:
                 counter += 1
                 print("current loc:", counter, "data:", row)
     seed_manager.close()
 def test_db_migration(self):
     old_seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
     new_seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db"
     old_db = CategorySeedSiteDB(old_seed_db_addr)
     new_db = CategorySeedSiteDB(new_seed_db_addr)
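     # Copy every table from the old database into the new one.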
     old_tables = [x[0] for x in old_db.cur.execute("SELECT name FROM sqlite_master WHERE type='table';")]
     print('table length:', len(old_tables))
     for table in old_tables:
         print('doing table:', table)
         data = old_db.get_from_table(table, 0, 10000000, reverse_read=False, random_read=False)
         print('data len for table:', len(data))
         new_db.save_to_table(table, data)
     old_db.close()
     new_db.close()
 def test_merge_db(self):
     merge_from = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB2.db"
     merge_to = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
     db_from = CategorySeedSiteDB(merge_from)
     db_to = CategorySeedSiteDB(merge_to)
     from_cat = db_from.get_sub_category_tables_name()
     for item in from_cat:
         results = db_from.get_from_table(item,
                                          0,
                                          10000000,
                                          reverse_read=False,
                                          random_read=False)
         print("adding:", item, "result:", len(results))
         db_to.save_to_table(item, results)
     db_from.close()
     db_to.close()
Example #10
    def testSeedsUpload2(self) -> int:
        categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db"
        # categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB2.db"
        target_ca = {
            "Games/Gambling": 62000,
            # "Society/Politics":20000,
            # "Society/Issues":20000,
            # "Business/Financial Services":20000,
        }

        # seeds_needed = 20000
        total_seeds = 0
        parameters = {"TF": 0}
        db = CategorySeedSiteDB(categoy_db_addr)
        for ca, seeds_needed in target_ca.items():
            sites = [
                x.ref_domain for x in db.get_from_table(ca,
                                                        68000,
                                                        seeds_needed,
                                                        random_read=False,
                                                        reverse_read=False,
                                                        filter_dict=parameters)
            ]
            # sites = get_seeds_normal(categoy_db_addr, seeds_needed, ca, parameters)
            total_seeds += len(sites)
            print("doing site:", ca, " size:", len(sites))
            in_data = MiningList(ref=seed_db_name, data=sites)
            ser = get_server()
            hostController = HostController(ser,
                                            cmd=ServerCommand.Com_Add_Seed,
                                            in_data=in_data)
            hostController.start()
            hostController.join()
        db.close()
        if len(target_ca) > 1:
            return int(total_seeds * 0.97)
        else:
            return total_seeds
Example #12
    def testUpload1(self):
        sock = StreamSocket()
        processor = CommandProcessor(sock.rfile, sock.wfile)

        seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/SeedSitesList"
        seed_db = SeedSiteDB("09/11/2015", db_addr=seed_db_addr)

        categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
        db = CategorySeedSiteDB(categoy_db_addr)
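        # Note: get_seeds() below opens its own connection from categoy_db_addr,
        # so this handle is not used by the loop that follows.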

        target_ca = [
            "Health/General", "Health/Nutrition", "Shopping/Clothing",
            "Computers/Internet/Web Design and Development", "Society/People",
            "Home/Gardening", "Computers/Hardware", "Recreation/Food"
        ]
        seeds_needed = 5000
        parameters = {"CF": 0}
        for ca in target_ca:
            sites = get_seeds(categoy_db_addr, seeds_needed, ca, parameters)
            print("doing site:", ca, " size:", len(sites))
            seed_db.add_sites(sites, skip_check=False)
        seed_db.close()
        db.close()
    def testImportSeeds0(self):
        import csv
        seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db"
        path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/Gambling3.csv"
        db = CategorySeedSiteDB(seed_db_addr)
        with open(path, mode='rt') as csv_file:
            # lines = len(csv_file.readlines())
            rd = csv.reader(csv_file, delimiter=',')
            header = next(rd)  # skip header
            counter = 0
            temp = []
            while True:
                try:
                    row = next(rd)
                    if len(row) == 0:
                        break
                    if len(row) == 6:
                        domain, backlink, tf, cf, topic, topical_tf = row
                        print("current loc:", counter, "data:", row)
                        # if len(topic) > 0:
                        #     decoded_topic = basic_manager.decode_sub_category(topic, False)
                        data = MajesticBacklinkDataStruct(
                            ref_domain=domain,
                            src_cf=int(cf),
                            src_tf=int(tf),
                            src_topical_tf=int(topical_tf))
                        temp.append(data)
                except StopIteration:
                    print('stop iteration')
                    break
                except Exception as ex:
                    print("exception:", str(ex), "row:", str(counter))
                    if len(str(ex)) == 0:
                        break
                finally:
                    counter += 1

        db.save_to_table('Games/Gambling', temp)
        db.close()
    def testGetSeedsFromBacklinks(self):
        import random
        import time
        logging_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/GeneralSeed5.csv"
        # seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB_WithCountry_Temp.db"
        seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB3.db"
        # logging_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/GeneralSeed4.csv"
        # seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
        save_seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB3.db"
        category_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/test/CategoryDB.db"
        seed_site_file_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/SiteFromResults.txt"
        db = CategorySeedSiteDB(seed_db_addr)
        basic_manager = CategoryManager()
        thread_pool_size = 20
        max_count = 5000

        category_manager = CategoryDBManager(category_db_addr)
        seed_manager = CategorySiteDBManager(
            CategorySeedSiteDB, db_path=save_seed_db_addr)  # was seed_db_addr
        seed_manager._max_site_limit = int(thread_pool_size * max_count * 0.75)

        counter = 0
        country_file_path = "/Users/superCat/Desktop/PycharmProjectPortable/SpamFilter/bad_country.txt"
        bad_countries = [
            x.upper()
            for x in FileIO.FileHandler.read_lines_from_file(country_file_path)
        ]

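        # Callback for each backlink/ref-domain result: drop domains from bad
        # countries or with TF outside the 5-95 range, convert ref-domain
        # structs to MajesticBacklinkDataStruct, then log and buffer entries
        # that carry a topic.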
        def backlink_callback_inner(link_data):

            if isinstance(link_data, MajesticRefDomainStruct):
                if link_data.country in bad_countries or link_data.tf < 5 or link_data.tf > 95:
                    link_data = None
                else:
                    link_data = MajesticBacklinkDataStruct(
                        ref_domain=link_data.domain,
                        backlink=link_data.domain,
                        src_tf=link_data.tf,
                        src_cf=link_data.cf,
                        src_topic=link_data.src_topic,
                        src_topical_tf=link_data.src_topic_tf,
                        country_code=link_data.country,
                        potential_url=link_data.potential_url)

            if isinstance(link_data, MajesticBacklinkDataStruct):
                if len(link_data.src_topic) > 1:
                    decoded_topic = basic_manager.decode_sub_category(
                        link_data.src_topic, False)
                    # print(backlink)
                    Logging.CsvLogger.log_to_file_path(logging_path, [
                        link_data.to_tuple(),
                    ])
                    seed_manager.append_to_buff(data=link_data,
                                                category=str(decoded_topic))

        total_count = 0
        seed_init_limit = 400
        seed_depth_limit = 3000
        temp_niches = []
        niches = []
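        # temp_niches is left empty in this run; populate it to pull additional
        # matching category tables into target_ca further down.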

        for niche in temp_niches:  # make valid niche for seeds
            # if niche.endswith("General"):
            #     niches.append(niche.rstrip("General"))
            # else:
            niches.append(niche)

        forbidden_list = [
            "bbc.co.uk", "wikipedia.org", "youtube.com", "amazon.co.uk",
            "facebook.com", "google.com", ".ru", ".cn", ".jp"
        ]
        for niche in niches:
            decoded_topic = basic_manager.decode_sub_category(niche, False)
            print(decoded_topic)
        minimum_tf = 25
        temp_sites = []
        target_ca = [
            "Society/Law", "Society/Politics", "Society/Issues",
            "Business/Financial Services", "Society/Government"
        ]
        sites = []
        parameters = {"TF": minimum_tf}
        key_words = [
            "Alcohol law", "Banking law", "Antitrust law", "Aviation law",
            "Corporate law", "Communications law", "Construction law",
            "Consumer law", "Drug control law", "Insurance law", "Tax law"
        ]
        # for item in key_words:
        #     temp_sites += GoogleCom.get_sites(keyword=item, index=0, filter_list=forbidden_list, blog=True)[0:]
        #     print("sites count:", len(temp_sites))
        #     time.sleep(2)
        # temp_sites = FileHandler.read_lines_from_file(seed_site_file_path)
        # temp_sites = list(set(temp_sites))
        print("seeds total:", len(temp_sites))
        categories = db.get_sub_category_tables_name()
        for niche in niches:
            target_ca += [x for x in categories if niche in x]

        seed_count = 0
        load_limit = seed_init_limit * 4

        def check_ending(domain: str):
            is_wrong_ending = False
            for item in forbidden_list:
                if domain.endswith(item):
                    is_wrong_ending = True
                    break
            return not is_wrong_ending

        for ca in target_ca:
            temp_sites += [
                y for y in filter(check_ending, [
                    x.ref_domain for x in db.get_from_table(
                        ca, 0, load_limit, filter_dict=parameters,
                        reverse_read=True, random_read=True)
                ])
            ]

        db.close()

        seed_count = len(temp_sites)
        # seed_init_limit = seed_count  #---------------------------

        if seed_count <= seed_init_limit:
            sites = temp_sites
        elif seed_init_limit < seed_count <= seed_init_limit * 2:
            sites = temp_sites[::2]
        else:
            while len(sites) < seed_init_limit:
                site = temp_sites[random.randint(0, seed_count - 1)]
                if site not in sites:
                    sites.append(site)
        # GoogleMajestic.get_sites_by_seed_sites(majestic, sites, catagories=niches, iteration=1,
        #                                                    count_per_domain=max_count, callback=backlink_callback_inner,
        #                                                    max_count=seed_depth_limit, tf=minimum_tf)
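        # "majestic" is assumed to be a Majestic API client created elsewhere
        # in this test module; it is not shown in this snippet.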
        GoogleMajestic.get_sites_by_seed_sites_muti_threads(
            majestic,
            sites,
            catagories=target_ca,
            iteration=4,
            count_per_domain=max_count,
            callback=backlink_callback_inner,
            max_count=seed_depth_limit + seed_init_limit,
            thread_pool_size=thread_pool_size,
            tf=minimum_tf,
            get_backlinks=False,
            bad_country_list=bad_countries)
        seed_manager.close()