def veri_good_b(self):
    """Handle a click on the "veri good" button.

    Resets the second-try flags of the english/original buttons, deletes the
    currently shown tabs, marks the first entry of ``self.veri_ready_list``
    as "VERI_GOOD", appends it to the veri-good file and removes it from the
    ready list. If another entry remains, its tabs are built and loaded and
    the keywords are highlighted. Finally the (shortened) ready list is
    written back to the veri-ready file.

    When the ready list is already empty nothing is touched; when the
    verified entry was the last one, tab loading is skipped but the ready
    file is still updated, so the entry does not stay in both files.
    """
    veri_good_site_status = "VERI_GOOD"
    print("IN: veri_good_b. VERI GOOD BUTTON CLICKED")

    # Guard: without a current entry there is nothing to verify. Bail out
    # before deleting tabs so the UI is left as it is.
    if not self.veri_ready_list:
        print("veri_good_b: VERI READY LIST IS EMPTY, NOTHING TO VERIFY")
        return

    print("veri_good_b: REPOPULATING TABS WITH NEXT SITE DATA")

    # Resetting second try markers.
    self.eng_b_second_try = False  # clearing english button tries
    self.original_b_second_try = False  # clearing original button tries

    # Deleting old tabs.
    old_tab_data_set = self.init_tabs_data_set_list
    self.delete_tabs(old_tab_data_set)

    # Adding the entry clicked as good to the veri good file.
    self.veri_ready_list[0][1] = veri_good_site_status
    current_site_data_entry = self.veri_ready_list[0]
    c_m.txt_file_append(current_site_data_entry, self.very_good_f_path)

    # Going to the next site data entry in the veri ready list.
    self.veri_ready_list.pop(0)

    if self.veri_ready_list:
        next_site_data_entry = self.veri_ready_list[0]
        # Loading next site tabs. The data set is kept on the instance so
        # the tabs can be deleted again after the next button click.
        self.init_tabs_data_set_list = self.mk_tabs_data_set(
            next_site_data_entry)
        self.laod_tabs(self.init_tabs_data_set_list, self.eng_text)
        print("veri_good_b: HIGHLIGHTING NEW SITE KEYWORDS")
        self.highlight_keys_in_site(self.test_hi_key_list)
    else:
        # The verified entry was the last one: there is no next site to
        # show, but the ready file below must still be rewritten.
        print("veri_good_b: NO MORE SITES IN VERI READY LIST")

    # Updating the veri ready file now that the first entry is removed.
    c_m.l_of_l_write(self.veri_ready_list, self.very_ready_f_path)
def check_if_duplicate(self):
    """Report whether ``self.base_url`` already exists in the veri files.

    Reads the veri-ready, veri-good and veri-bad files, maps the base URL
    (entry index 3) of every entry to its link id (entry index 0) and
    appends every id to ``self.old_id_list`` (used for next id generation).
    If ``self.base_url`` is among the known base URLs, a
    ``[id, "DUPLICATE_BASE_URL", base_url]`` record is appended to the
    duplicate file.

    Returns:
        True if the base URL was already known, False otherwise.
    """
    # Loads lists from the ready, verified-as-good and verified-as-bad files.
    print("READing VERI READY LIST")
    veri_ready_list = c_m.l_of_l_read(self.vr_file_path)
    print("READ VERI GOOD")
    veri_good_list = c_m.l_of_l_read(self.vg_file_path)
    print("READ VERI BAD")
    veri_bad_list = c_m.l_of_l_read(self.vb_file_path)

    print("MERGING TO ALL ENTRIES LISTS......")
    old_entries_list = veri_ready_list + veri_good_list + veri_bad_list

    print("EXTRACTING BASE URL FOR DUPLI CHECKING END ID AS LINK")
    old_base_url_id_dict = {}
    for entry in old_entries_list:
        current_site_link_id = entry[0]
        print("CURRENT ENTRY %s" % (entry,))
        current_site_base_url = entry[3]
        # A base URL seen more than once keeps the id of its last entry.
        old_base_url_id_dict[current_site_base_url] = current_site_link_id
        self.old_id_list.append(current_site_link_id)  # for next id generation

    # Only the lookup itself is guarded, so a KeyError raised while writing
    # the duplicate record is not mistaken for "link is unique".
    try:
        duplicate_link_id = old_base_url_id_dict[self.base_url]
    except KeyError:
        print("INPUT SITE LINK IS UNIQUE")
        return False

    print("DUPLICATE LINK FOUND")
    print("ADDING TO DUPLICATE link FILE\npath:%s" % self.dupli_file_path)
    dupli_entry_pack = [
        duplicate_link_id, "DUPLICATE_BASE_URL", self.base_url
    ]
    c_m.txt_file_append(dupli_entry_pack, self.dupli_file_path)
    return True
def drive_input_links(self):
    """Drive the input links through the enabled processing modules.

    The link list is re-read from ``self.link_input_path`` at the start of
    every iteration and the first link (``[0][1]``) is processed: HTML
    retrieval, text extraction and english translation. After a link has
    been handled - successfully or not - it is popped and the list is
    written back to the file, which is what makes the loop terminate.
    Links whose HTML retrieval fails are recorded as ``["FAILED", url]`` in
    ``self.site_failed_path``.

    For partial functionality comment out the marked modules.
    """
    # Non-empty placeholder so the loop is entered once; the real list is
    # read from the file inside the loop.
    input_links_nested_list = [[], []]
    while input_links_nested_list:
        # Retrieving the input link list.
        input_links_nested_list = c_m.l_of_l_read(self.link_input_path)
        try:
            # Input link in the nested-list type data table; raises
            # IndexError if an empty input link file is provided.
            url = input_links_nested_list[0][1]
        except IndexError:
            print("drive_input_links: IndexError: MOST LIKELY INPUT FILE "
                  "IS EMPTY, ADD SOME SITE LINKS")
            break
        print("drive_input_links: INPUT URL: %s" % (url,))

        # Driving modules.
        # System module. The instance is not used below; NOTE(review):
        # presumably constructed for its side effects - confirm before
        # removing.
        site_selector = Common_Paths(url)

        # ---COMMENT OUT if you do not need SITE LINK MANAGEMENT AND INPUT URL DUBLI CHECK:
        # site_link_manager_sys = Site_Link_Manager(url)
        # is_duplicate = site_link_manager_sys.check_if_duplicate()
        # if is_duplicate:
        #     print "drive_input_links: INPUT SITE LINK IS DUPLICATE, TAKING NEXT LINK"
        #     input_links_nested_list.pop(0)
        #     c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
        #     continue
        # print "drive_input_links: DUBLI CHECK DONE"
        # ///

        # ---COMMENT OUT if you do not need HTML RETRIEVAL
        html_maker = HTML_File_Maker(url)
        # Output html bs_object dict for the next module.
        site_bs_object_dict = html_maker.mk_htmls()
        if site_bs_object_dict is False:
            failed_entry = ["FAILED", url]
            c_m.txt_file_append(failed_entry, self.site_failed_path)
            input_links_nested_list.pop(0)
            # The list is re-read from the file on the next iteration, so
            # the pop must be persisted; otherwise the same failing link
            # would be retried forever.
            c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
            continue
        print("drive_input_links: HTML FILES DONE BS OBJECTS READY")
        # ///

        # ---COMMENT OUT if you do not need TEXT EXTRACTION
        txt_maker = Text_Extractor(url)
        txt_maker.mk_text_files(site_bs_object_dict)
        print("drive_input_links: TEXT FILE MAKER DONE")
        # ///

        # ---COMMENT OUT if you do not need ENGLISH TRANSLATION
        translator = Translator(url)
        translator.mk_eng_txt_files(rewrite=False)
        print("drive_input_links: TRANSLATOR DONE")
        # ///

        # ---COMMENT OUT do not need VERI_GOOD FILE UPDATE.
        # print "drive_input_links: ADDING TO VERI GOOD FILE"
        # site_link_manager_sys.add_to_veri_good()
        # print "drive_input_links: LINK DONE, POPING..."
        # ///

        # !!! POP INPUT LINK AND UPDATE INPUT LINK FILE:
        # NECESSARY FOR A NON ETERNAL LOOP
        input_links_nested_list.pop(0)
        print("drive_input_links: UPDATING INPUT LINK DATA")
        c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
        # ///
def add_to_veri_good(self):
    """Append a new entry for the current site to ``self.vr_file_path``.

    The record is ``[next id, "READY_FOR_VERI", input url, base url]``,
    where the id comes from ``self.make_next_id()``. Note that, despite the
    method name, the status written is "READY_FOR_VERI" and the target is
    the ``vr_file_path`` file.
    """
    # The id is generated first, before the url attributes are read.
    record = [self.make_next_id(), "READY_FOR_VERI"]
    record.extend((self.input_url, self.base_url))
    c_m.txt_file_append(record, self.vr_file_path)