def testPrintSeedDBSingleNiche(self):
    """Count and print the seeds stored for the 'Society/Law' niche (TF filter 0)."""
    db_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
    filter_params = {"TF": 0}
    category_db = CategorySeedSiteDB(db_path)
    niche_total = category_db.get_total("Society/Law", **filter_params)
    category_db.close()
    print(niche_total)
def testeedExport(self):
    """Export up to 20000 'Business/Marketing and Advertising' seeds (CF >= 20)
    from the category seed DB into the '26/10/2015 Marketing CF20' seed list.

    Each exported site is stored as a (ref_domain, 0) tuple via add_sites.
    """
    seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/SeedSitesList"
    seed_db = SeedSiteDB("26/10/2015 Marketing CF20", db_addr=seed_db_addr)
    categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
    db = CategorySeedSiteDB(categoy_db_addr)
    categories = db.get_sub_category_tables_name()
    target_ca = [
        x for x in categories if "Business/Marketing and Advertising" in x
    ]
    sites = []
    seeds_needed = 20000
    percentage = 1  # fraction of each table to export; 1 == everything, capped at seeds_needed
    parameters = {"CF": 20, }
    for ca in target_ca:
        sites.clear()  # reuse the buffer per category
        count = db.get_total(ca)
        if percentage == 1 and count > seeds_needed:
            count = seeds_needed
        count = int(percentage * count)
        if count > 0:
            temp = db.get_from_table(ca, 0, count, random_read=False,
                                     filter_dict=parameters)
            for item in temp:
                if isinstance(item, MajesticBacklinkDataStruct):
                    sites.append((item.ref_domain, 0))
            seed_db.add_sites(sites, skip_check=True)
    db.close()  # fix: the category DB handle was previously never closed
    seed_db.close()
def get_seeds(categoy_db_addr: str, seed_limit: int, niche: str, parameters: dict):
    """Collect up to ``seed_limit`` ref-domains from every category table whose
    name contains ``niche``.

    Reads up to ``seed_limit * 5`` random rows per matching table, then thins
    the pooled result down to at most ``seed_limit`` entries.

    :param categoy_db_addr: path to the category seed sqlite DB file.
    :param seed_limit: target number of seed domains to return.
    :param niche: substring used to select matching category tables.
    :param parameters: filter dict forwarded to ``get_from_table``.
    :return: list of ref-domain strings (may be shorter than seed_limit).
    """
    db = CategorySeedSiteDB(categoy_db_addr)
    temp_sites = []
    try:
        categories = db.get_sub_category_tables_name()
        target_ca = [x for x in categories if niche in x]
        load_limit = seed_limit * 5
        for ca in target_ca:
            print("getting seeds from:", ca)
            temp_sites += [
                x.ref_domain for x in db.get_from_table(
                    ca, 0, load_limit, random_read=True,
                    filter_dict=parameters)
            ]
    finally:
        db.close()  # fix: the DB handle was previously never closed
    seed_count = len(temp_sites)
    if seed_count <= seed_limit:
        return temp_sites
    if seed_count <= seed_limit * 2:
        # moderately oversized pool: keep every other entry
        return temp_sites[::2]
    # fix: the old rejection-sampling loop could spin forever when the pool
    # held fewer than seed_limit UNIQUE domains (duplicates across tables);
    # dedupe first, then sample without replacement.
    unique_sites = list(dict.fromkeys(temp_sites))
    if len(unique_sites) <= seed_limit:
        return unique_sites
    return random.sample(unique_sites, seed_limit)
def testPrintSeedDB(self):
    """Print per-category seed counts for the whole DB (TF filter 0) and
    optionally append them to a CSV log file.

    ``target_niche`` may be set to restrict output to matching tables; the
    empty string means every category.
    """
    seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB3.db"
    log_file_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/SeedLog3.csv"
    enable_log = True
    FileHandler.remove_file_if_exist(log_file_path)  # start a fresh log
    db = CategorySeedSiteDB(seed_db_addr)
    categories = db.get_sub_category_tables_name()
    total_count = 0
    target_niche = ""
    parameters = {"TF": 0}
    if enable_log:
        CsvLogger.log_to_file_path(log_file_path, [
            ("parameters", str(parameters)),
        ])
    for item in categories:
        if target_niche in item or len(target_niche) == 0:
            count = db.get_total(item, **parameters)
            total_count += count
            print(item, " ", count)
            if enable_log:
                CsvLogger.log_to_file_path(log_file_path, [
                    (item, str(count)),
                ])
    db.close()  # fix: the DB handle was previously never closed
    print("total:", total_count)
    if enable_log:
        CsvLogger.log_to_file_path(log_file_path, [
            ("total", str(total_count)),
        ])
def testSeedsUpload_General(self) -> int:
    """Upload up to 200 seeds per allowed category to the crawl server.

    Categories whose name contains any forbidden niche keyword are skipped.
    All seeds are pooled into a single MiningList and sent in one
    HostController run.

    :return: count of seeds uploaded; when more than one category was
             uploaded, discounted by 3% (likely allowing for duplicates —
             NOTE(review): rationale not stated in source, confirm).
    """
    categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB3.db"
    # categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB2.db"
    db = CategorySeedSiteDB(categoy_db_addr)
    categories = db.get_sub_category_tables_name()
    db.close()  # fix: handle was never closed (get_seeds_normal opens its own)
    forbidden_niche = ['Adult/', 'Gambling', 'Law', 'Directory']
    seed_count = 200
    target_ca = {}
    for item in categories:
        if not any(x in item for x in forbidden_niche):
            target_ca.update({item: seed_count})
    total_seeds = 0
    in_data = MiningList(ref=seed_db_name, data=[])
    parameters = {"TF": 5}
    counter = 0
    for ca, seeds_needed in target_ca.items():
        sites = get_seeds_normal(categoy_db_addr, seeds_needed, ca, parameters)
        total_seeds += len(sites)
        print(counter, " doing site:", ca, " size:", len(sites),
              " total:", total_seeds)
        in_data.data += sites
        counter += 1
    ser = get_server()
    hostController = HostController(ser, cmd=ServerCommand.Com_Add_Seed,
                                    in_data=in_data)
    hostController.start()
    hostController.join()
    # fix: function is annotated -> int but the discount path returned a float
    if len(target_ca) > 1:
        return int(total_seeds * 0.97)
    else:
        return total_seeds
def get_seeds_normal(categoy_db_addr: str, seed_limit: int, niche: str, parameters: dict):
    """Fetch up to ``seed_limit`` ref-domains from one niche table, in stored order.

    Opens its own connection to the category seed DB and closes it before
    returning.
    """
    source_db = CategorySeedSiteDB(categoy_db_addr)
    rows = source_db.get_from_table(niche, 0, seed_limit,
                                    random_read=False,
                                    filter_dict=parameters)
    domains = []
    for row in rows:
        domains.append(row.ref_domain)
    source_db.close()
    return domains
def testImportSeeds(self):
    """Import rows from Gambling3.csv into the category seed DB.

    Each 6-field CSV row (domain, backlink, tf, cf, topic, topical_tf) with a
    non-empty topic is decoded to a category and buffered through
    ``seed_manager``; malformed rows are reported and skipped.
    """
    seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
    category_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/test/CategoryDB.db"
    db = CategorySeedSiteDB(seed_db_addr)
    basic_manager = CategoryManager()
    # NOTE(review): category_manager is constructed but never referenced below —
    # confirm whether construction has required side effects before removing.
    category_manager = CategoryDBManager(category_db_addr)
    seed_manager = CategorySiteDBManager(CategorySeedSiteDB, db_path=seed_db_addr)
    import csv
    path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/Gambling3.csv"
    counter = 0
    with open(path, mode='r', newline='', encoding='utf-8') as csv_file:
        rd = csv.reader(csv_file, delimiter=',')
        for row in rd:
            try:
                if len(row) == 6:
                    domain, backlink, tf, cf, topic, topical_tf = row
                    if len(topic) > 0:
                        decoded_topic = basic_manager.decode_sub_category(
                            topic, False)
                        data = MajesticBacklinkDataStruct(
                            ref_domain=domain, src_cf=int(cf), src_tf=int(tf),
                            src_topic=str(decoded_topic),
                            src_topical_tf=int(topical_tf))
                        seed_manager.append_to_buff(
                            data=data, category=str(decoded_topic))
            except Exception as ex:
                # bad row (non-numeric tf/cf, decode failure): report, keep going
                print(ex, "row:", row)
            finally:
                counter += 1
                print("current loc:", counter, "data:", row)
    db.close()  # fix: this handle was opened but never used or closed
    seed_manager.close()
def test_db_migration(self):
    """Copy every table of the old category seed DB into the new DB file."""
    src_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
    dst_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db"
    src_db = CategorySeedSiteDB(src_path)
    dst_db = CategorySeedSiteDB(dst_path)
    # enumerate table names straight from sqlite_master via the wrapper's cursor
    name_rows = src_db.cur.execute(
        "SELECT name FROM sqlite_master WHERE type='table';")
    table_names = [name for (name,) in name_rows]
    print('table length:', len(table_names))
    for name in table_names:
        print('doing table:', name)
        rows = src_db.get_from_table(name, 0, 10000000,
                                     reverse_read=False, random_read=False)
        print('data len for table:', len(rows))
        dst_db.save_to_table(name, rows)
    src_db.close()
    dst_db.close()
def test_merge_db(self):
    """Append every category table from CategorySeedDB2 into CategorySeedDB.

    Reads each source table in full (up to 10M rows) and saves the rows into
    the same-named table of the destination DB.
    """
    merge_from = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB2.db"
    merge_to = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
    db_from = CategorySeedSiteDB(merge_from)
    db_to = CategorySeedSiteDB(merge_to)
    try:
        from_cat = db_from.get_sub_category_tables_name()
        for item in from_cat:
            results = db_from.get_from_table(item, 0, 10000000,
                                             reverse_read=False,
                                             random_read=False)
            print("adding:", item, "result:", len(results))
            db_to.save_to_table(item, results)
    finally:
        # fix: neither connection was previously closed
        db_from.close()
        db_to.close()
def testSeedsUpload2(self) -> int:
    """Upload a fixed slice of category seeds to the crawl server.

    For each configured category, reads ``seeds_needed`` rows starting at
    offset 68000 and pushes the batch via HostController.

    :return: count of seeds uploaded; discounted by 3% when more than one
             category was sent (NOTE(review): rationale not stated in source).
    """
    categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db"
    # categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB2.db"
    target_ca = {
        "Games/Gambling": 62000,
        # "Society/Politics":20000,
        # "Society/Issues":20000,
        # "Business/Financial Services":20000,
    }
    total_seeds = 0
    parameters = {"TF": 0}
    db = CategorySeedSiteDB(categoy_db_addr)
    try:
        for ca, seeds_needed in target_ca.items():
            sites = [
                x.ref_domain
                for x in db.get_from_table(ca, 68000, seeds_needed,
                                           random_read=False,
                                           reverse_read=False,
                                           filter_dict=parameters)
            ]
            total_seeds += len(sites)
            print("doing site:", ca, " size:", len(sites))
            in_data = MiningList(ref=seed_db_name, data=sites)
            ser = get_server()
            hostController = HostController(ser,
                                            cmd=ServerCommand.Com_Add_Seed,
                                            in_data=in_data)
            hostController.start()
            hostController.join()
    finally:
        db.close()  # robustness: close even when an upload raises
    # fix: function is annotated -> int but the discount path returned a float
    if len(target_ca) > 1:
        return int(total_seeds * 0.97)
    else:
        return total_seeds
def testUpload1(self):
    """Export random seeds for a fixed list of categories into the
    '09/11/2015' seed-site list (5000 per category, CF filter 0)."""
    # NOTE(review): sock/processor are constructed but never used below —
    # confirm whether construction has required side effects before removing.
    sock = StreamSocket()
    processor = CommandProcessor(sock.rfile, sock.wfile)
    seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/SeedSitesList"
    seed_db = SeedSiteDB("09/11/2015", db_addr=seed_db_addr)
    categoy_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
    # NOTE(review): get_seeds opens its own connection; this handle is unused.
    db = CategorySeedSiteDB(categoy_db_addr)
    target_ca = [
        "Health/General", "Health/Nutrition", "Shopping/Clothing",
        "Computers/Internet/Web Design and Development", "Society/People",
        "Home/Gardening", "Computers/Hardware", "Recreation/Food"
    ]
    seeds_needed = 5000
    parameters = {"CF": 0}
    for ca in target_ca:
        sites = get_seeds(categoy_db_addr, seeds_needed, ca, parameters)
        print("doing site:", ca, " size:", len(sites))
        seed_db.add_sites(sites, skip_check=False)
    db.close()  # fix: handle was previously never closed
    seed_db.close()
def testImportSeeds0(self):
    """Read Gambling3.csv and bulk-save its rows into the 'Games/Gambling'
    table of the new category seed DB.

    Stops at the first blank row, at end-of-file, or when an exception with
    an empty message is raised; other per-row errors are reported and skipped.
    """
    db_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db"
    csv_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/Gambling3.csv"
    seed_db = CategorySeedSiteDB(db_path)
    with open(csv_path, mode='rt') as source:
        reader = csv.reader(source, delimiter=',')
        header = next(reader)  # skip header
        line_no = 0
        records = []
        while True:
            try:
                fields = next(reader)
                if len(fields) == 0:
                    break  # blank row terminates the import
                if len(fields) == 6:
                    domain, backlink, tf, cf, topic, topical_tf = fields
                    print("current loc:", line_no, "data:", fields)
                    records.append(MajesticBacklinkDataStruct(
                        ref_domain=domain, src_cf=int(cf), src_tf=int(tf),
                        src_topical_tf=int(topical_tf)))
            except StopIteration:
                print('stop iteration')
                break
            except Exception as ex:
                print("exception:", str(ex), "row:", str(line_no))
                if len(str(ex)) == 0:
                    break
            finally:
                line_no += 1
    seed_db.save_to_table('Games/Gambling', records)
    seed_db.close()
def test_db_migration(self):
    # NOTE(review): a method with this exact name is defined earlier in the
    # file; this later definition shadows it, so only this one is runnable.
    """Copy every table of the old category seed DB into the new DB file."""
    old_seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
    new_seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/NewCategorySeedDB.db"
    old_db = CategorySeedSiteDB(old_seed_db_addr)
    new_db = CategorySeedSiteDB(new_seed_db_addr)
    # enumerate all table names straight from sqlite_master via the wrapper's cursor
    old_tables = [
        x[0] for x in old_db.cur.execute(
            "SELECT name FROM sqlite_master WHERE type='table';")
    ]
    print('table length:', len(old_tables))
    for table in old_tables:
        print('doing table:', table)
        # 10000000 acts as a "read everything" limit — assumes no table exceeds 10M rows
        data = old_db.get_from_table(table, 0, 10000000,
                                     reverse_read=False, random_read=False)
        print('data len for table:', len(data))
        new_db.save_to_table(table, data)
    old_db.close()
    new_db.close()
def testGetSeedsFromBacklinks(self):
    """Seed-expansion run: pull starter sites from the category seed DB, then
    crawl Majestic ref-domains for them (multi-threaded) and buffer the
    filtered results back into a category seed DB through ``seed_manager``.

    NOTE(review): formatting reconstructed from a line-collapsed source;
    behavior-sensitive placements (e.g. ``db.close()`` after the read loop)
    follow the only interpretation that runs without error — confirm against
    the original file.
    """
    import random
    import time
    logging_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/GeneralSeed5.csv"
    # seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB_WithCountry_Temp.db"
    seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB3.db"
    # logging_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/GeneralSeed4.csv"
    # seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB.db"
    save_seed_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/CategorySeedDB3.db"
    category_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/test/CategoryDB.db"
    seed_site_file_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/SiteFromResults.txt"
    db = CategorySeedSiteDB(seed_db_addr)
    basic_manager = CategoryManager()
    thread_pool_size = 20
    max_count = 5000  # rows requested per domain (count_per_domain below)
    # NOTE(review): category_manager is not referenced again in this method
    category_manager = CategoryDBManager(category_db_addr)
    seed_manager = CategorySiteDBManager(
        CategorySeedSiteDB, db_path=save_seed_db_addr)  # was seed_db_addr
    seed_manager._max_site_limit = int(thread_pool_size * max_count * 0.75)
    counter = 0
    country_file_path = "/Users/superCat/Desktop/PycharmProjectPortable/SpamFilter/bad_country.txt"
    bad_countries = [
        x.upper()
        for x in FileIO.FileHandler.read_lines_from_file(country_file_path)
    ]

    def backlink_callback_inner(link_data):
        # Normalize ref-domain results into MajesticBacklinkDataStruct,
        # dropping domains from bad countries or with tf < 5 or tf > 95.
        if isinstance(link_data, MajesticRefDomainStruct):
            if link_data.country in bad_countries or link_data.tf < 5 or link_data.tf > 95:
                link_data = None
                pass
            else:
                link_data = MajesticBacklinkDataStruct(
                    ref_domain=link_data.domain,
                    backlink=link_data.domain,
                    src_tf=link_data.tf,
                    src_cf=link_data.cf,
                    src_topic=link_data.src_topic,
                    src_topical_tf=link_data.src_topic_tf,
                    country_code=link_data.country,
                    potential_url=link_data.potential_url)
        if isinstance(link_data, MajesticBacklinkDataStruct):
            # only keep links that carry a topic string; log then buffer them
            if len(link_data.src_topic) > 1:
                decoded_topic = basic_manager.decode_sub_category(
                    link_data.src_topic, False)
                # print(backlink)
                Logging.CsvLogger.log_to_file_path(logging_path, [
                    link_data.to_tuple(),
                ])
                seed_manager.append_to_buff(data=link_data,
                                            category=str(decoded_topic))

    total_count = 0
    seed_init_limit = 400    # target number of starter sites
    seed_depth_limit = 3000
    temp_niches = []         # empty here, so the niche loops below are no-ops
    niches = []
    for niche in temp_niches:
        # make valid niche for seeds
        # if niche.endswith("General"):
        #     niches.append(niche.rstrip("General"))
        # else:
        niches.append(niche)
    forbidden_list = [
        "bbc.co.uk", "wikipedia.org", "youtube.com", "amazon.co.uk",
        "facebook.com", "google.com", ".ru", ".cn", ".jp"
    ]
    for niche in niches:
        decoded_topic = basic_manager.decode_sub_category(niche, False)
        print(decoded_topic)
    minimum_tf = 25
    temp_sites = []
    target_ca = [
        "Society/Law", "Society/Politics", "Society/Issues",
        "Business/Financial Services", "Society/Government"
    ]
    sites = []
    parameters = {"TF": minimum_tf}
    key_words = [
        "Alcohol law", "Banking law", "Antitrust law", "Aviation law",
        "Corporate law", "Communications law", "Construction law",
        "Consumer law", "Drug control law", "Insurance law", "Tax law"
    ]
    # for item in key_words:
    #     temp_sites += GoogleCom.get_sites(keyword=item, index=0, filter_list=forbidden_list, blog=True)[0:]
    #     print("sites count:", len(temp_sites))
    #     time.sleep(2)
    # temp_sites = FileHandler.read_lines_from_file(seed_site_file_path)
    # temp_sites = list(set(temp_sites))
    print("seeds total:", len(temp_sites))
    categories = db.get_sub_category_tables_name()
    for niche in niches:
        target_ca += [x for x in categories if niche in x]
    seed_count = 0
    load_limit = seed_init_limit * 4

    def check_ending(domain: str):
        # True when the domain does NOT end with any forbidden site/suffix.
        is_wrong_ending = False
        for item in forbidden_list:
            if domain.endswith(item):
                is_wrong_ending = True
                break
        return not is_wrong_ending

    for ca in target_ca:
        temp_sites += [
            y for y in filter(check_ending, [
                x.ref_domain for x in db.get_from_table(
                    ca, 0, load_limit, parameters,
                    reverse_read=True, random_read=True)
            ])
        ]
    db.close()
    seed_count = len(temp_sites)
    # seed_init_limit = seed_count #---------------------------
    if seed_count <= seed_init_limit:
        sites = temp_sites
    elif seed_init_limit < seed_count <= seed_init_limit * 2:
        sites = temp_sites[::2]
    else:
        # rejection-sample seed_init_limit unique sites from the pool
        while len(sites) < seed_init_limit:
            site = temp_sites[random.randint(0, seed_count - 1)]
            if site not in sites:
                sites.append(site)
    # GoogleMajestic.get_sites_by_seed_sites(majestic, sites, catagories=niches, iteration=1,
    #                                        count_per_domain=max_count, callback=backlink_callback_inner,
    #                                        max_count=seed_depth_limit, tf=minimum_tf)
    # NOTE(review): `majestic` is not defined in this method — presumably a
    # module-level object; confirm.
    GoogleMajestic.get_sites_by_seed_sites_muti_threads(
        majestic, sites, catagories=target_ca, iteration=4,
        count_per_domain=max_count, callback=backlink_callback_inner,
        max_count=seed_depth_limit + seed_init_limit,
        thread_pool_size=thread_pool_size, tf=minimum_tf,
        get_backlinks=False, bad_country_list=bad_countries)
    seed_manager.close()