def mk_eng_txt_files(self, **rewrite): """ Takes a list of texts parts feeds to translate, gets the translated text, stacks it together""" print "mk_eng_txt_files: RETRIEVING PAGE_LIST.........." page_list = c_m.l_of_l_read(self.page_list_path) if len(page_list) < 1: # handling empty page_list case print "mk_eng_txt_files: PAGE LIST IS NOT POPULATED, RUN HTML_File_Maker AND Text_Extractor MODULES FIRST" else: # handling page_list for partially transalted sites print "mk_eng_txt_files: IN CASE PAGE LIST ALREADY HAD SOME ENG_TEXT ENTRIES SETTING INITIAL new_page_list TO LAST KNOWN PAGE_LIST VERSION" self.new_page_list = copy(page_list) # iterating throug unique text per page txts for data_set in page_list: self.p_text_f_name = data_set[2] print "mk_eng_txt_files: TRANSLATING TEXT FROM FILE %s" % self.p_text_f_name self.eng_p_text_f_name = "eng_" + self.p_text_f_name self.eng_p_text_f_path = self.text_eng_folder_path + self.eng_p_text_f_name self.page_text = c_m.simply_read(self.text_folder_path, self.p_text_f_name) # if page has less than 10 symbols it is not translated if len(self.page_text) < 10: print "mk_eng_txt_files: NOT WORTH TRANSLATING, WRITING AS IS AND SKIPPING..." 
c_m.simply_write(self.page_text, self.eng_p_text_f_path) elif len(self.page_text) > self.max_page_length: print "mk_eng_txt_files: PAGE TEXT IS TOO LONG DEVIDING TO PARTS, TRANSLATING AND GETTING BACK FULL PAGE TEXT" text_output = self.get_text_parts(**rewrite) else: # 10 < len(page_text) < 2000 if rewrite["rewrite"]: print "mk_eng_txt_files: TRANSLATING IN REWRITE MODE" text_output = self.get_text() elif not os.path.exists(self.eng_p_text_f_path): print "mk_eng_txt_files: TRANSLATING IN ONLY ONCE MODE" text_output = self.get_text() else: print "mk_eng_txt_files: SKIPPING FILE, ALREADY TRANSLATED" # continue # print "WRITING TRANSLATED OUTPUT TO FILE: ", self.eng_p_text_f_name # c_m.simply_write(text_output, self.text_eng_folder_path, self.eng_p_text_f_name) data_set.append(self.eng_p_text_f_name ) # updating dataset with eng_text file name self.new_page_list.append( data_set) # updating page list with updated entry print "mk_eng_txt_files: DONE TRANSLATING SITE %s " % self.domain print "mk_eng_txt_files: UPDATING PAGE LIST WITH ENG TEXT FILE NAMES" c_m.l_of_l_write(self.new_page_list, self.page_list_path) print "mk_eng_txt_files: SITE TRANSLATION FINISHED, CLOSING CHROME WEBDIRVER" self.loaded_driver.quit()
def drive_input_links(self): # driving links input_links_nested_list = [[], []] while len(input_links_nested_list) > 0: # retieving input_link_list input_links_nested_list = c_m.l_of_l_read(self.link_input_path) url = input_links_nested_list[0][ 1] #input link in nested list type data table. # will raise out of range error if empty input link file is provided. print "drive_input_links: INPUT URL:", url # driving modules site_selector = Common_Paths(url) # System module # Comment out if you do not need site link management site_link_manager_sys = Site_Link_Manager(url) # checking if link is duplicate is_duplicate = site_link_manager_sys.check_if_duplicate() if is_duplicate: print "drive_input_links: GOING TO THE NEXT SITE LINK" input_links_nested_list.pop(0) c_m.l_of_l_write(input_links_nested_list, self.link_input_path) continue print "drive_input_links: DUBLI CHECK DONE" #/ # Comment out if you do not need html retrieval html_maker = HTML_File_Maker(url) site_bs_object_dict = html_maker.mk_htmls( ) # output html bs_object dict for next module print "drive_input_links: HTML FILES DONE BS OBJECTS READY" #/ # Comment out if you do not need text extraction txt_maker = Text_Extractor(url) txt_maker.mk_text_files(site_bs_object_dict) print "drive_input_links: TEXT FILE MAKER DONE" #/ # Comment out if you do not need english translation translator = Translator(url) translator.mk_eng_txt_files() print "drive_input_links: TRANSLATOR DONE" print "drive_input_links: ADDING TO VERI GOOD FILE" #/ # Comment out if you do not need site link management site_link_manager_sys.add_to_veri_good() print "drive_input_links: LINK DONE, POPING..." input_links_nested_list.pop(0) print "drive_input_links: UPDATING INPUT LINK DATA" c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
def check_if_duplicate(self): # loads lists from ready, verified as good and bad text files print "READing VERI READY LIST" veri_ready_list = c_m.l_of_l_read(self.vr_file_path) print "READ VERI GOOD" veri_good_list = c_m.l_of_l_read(self.vg_file_path) print "READ VERI BAD" veri_bad_list = c_m.l_of_l_read(self.vb_file_path) #/ print "MERGING TO ALL ENTRIES LISTS......" old_entries_list = veri_ready_list + veri_good_list + veri_bad_list # adding all lists into one print "EXTRACTING BASE URL FOR DUPLI CHECKING END ID AS LINK" old_base_url_id_dict = {} for entry in old_entries_list: current_site_link_id = entry[0] print "CURRENT ENTRY", entry current_site_base_url = entry[3] old_base_url_id_dict[current_site_base_url] = current_site_link_id self.old_id_list.append( current_site_link_id) # for next id generation try: old_base_url_id_dict[self.base_url] print "DUPLICATE LINK FOUND" print "ADDING TO DUPLICATE link FILE\npath:%s" % self.dupli_file_path dupli_entry_pack = [ old_base_url_id_dict[self.base_url], "DUPLICATE_BASE_URL", self.base_url ] c_m.txt_file_append(dupli_entry_pack, self.dupli_file_path) return True except KeyError: print "INPUT SITE LINK IS UNIQUE" return False
def mk_tabs_data_set(self, site_data_entry): """According to site data from 1_ready_for_veri file retrieves page data from page_list file, places homepage as first tab, sorts other according to page text symbol count from biggest down""" print "mk_tabs_data_set: RETRIEVING AND REARANGING PAGE LIST ENTRY DATA" base_url = site_data_entry[3] page_list_f_name = 'page_list.txt' url_domain = c_m.strip_to_domain(base_url) # domain acts as key in filesystem composed of folders named as domains print "FOR DOMAIN: ", url_domain domain_folder = url_domain + "\\" page_list_path = self.main_path + domain_folder + page_list_f_name page_list = c_m.l_of_l_read(page_list_path) print "mk_tabs_data_set: ADJUSTING PAGE LIST" adjusted_page_list = self.mk_adjusted_page_list(page_list) # sorting list according to unique page text length return adjusted_page_list
def drive_input_links(self): """ DRIVES INPUT LINKS AND MODULES, FOR PARTIAL FUNCTIONALITY COMMENT OUT MARKED MODULES""" # driving links input_links_nested_list = [[], []] while len(input_links_nested_list) > 0: # retieving input_link_list input_links_nested_list = c_m.l_of_l_read(self.link_input_path) try: url = input_links_nested_list[0][ 1] #input link in nested list type data table. # will raise out of range error if empty input link file is provided. except IndexError: print "drive_input_links: IndexError: MOST LIKELY INPUT FILE IS EMPTY, ADD SOME SITE LINKS" break print "drive_input_links: INPUT URL:", url # driving modules site_selector = Common_Paths(url) # System module # ---COMMENT OUT if you do not need SITE LINK MANAGEMENT AND INPUT URL DUBLI CHECK: # site_link_manager_sys = Site_Link_Manager(url) # is_duplicate = site_link_manager_sys.check_if_duplicate() # if is_duplicate: # print "drive_input_links: INPUT SITE LINK IS DUPLICATE, TAKING NEXT LINK" # input_links_nested_list.pop(0) # c_m.l_of_l_write(input_links_nested_list, self.link_input_path) # continue # print "drive_input_links: DUBLI CHECK DONE" #/// # ---COMMENT OUT if you do not need HTML RETRIEVAL html_maker = HTML_File_Maker(url) site_bs_object_dict = html_maker.mk_htmls( ) # output html bs_object dict for next module if site_bs_object_dict is False: failed_entry = ["FAILED", url] c_m.txt_file_append(failed_entry, self.site_failed_path) input_links_nested_list.pop(0) continue print "drive_input_links: HTML FILES DONE BS OBJECTS READY" #/// # ---COMMENT OUT if you do not need TEXT EXTRACTION txt_maker = Text_Extractor(url) txt_maker.mk_text_files(site_bs_object_dict) print "drive_input_links: TEXT FILE MAKER DONE" #/// # ---COMMENT OUT if you do not need ENGLISH TRANSLATION translator = Translator(url) translator.mk_eng_txt_files(rewrite=False) print "drive_input_links: TRANSLATOR DONE" #/// # ---COMMENT OUT do not need VERI_GOOD FILE UPDATE. 
# print "drive_input_links: ADDING TO VERI GOOD FILE" # site_link_manager_sys.add_to_veri_good() # print "drive_input_links: LINK DONE, POPING..." #/// """ !!! POP INPUT LINK AND UPDATE INPUT LINK FILE NECESARRY FOR NON ETERNAL LOOP """ input_links_nested_list.pop(0) print "drive_input_links: UPDATING INPUT LINK DATA" c_m.l_of_l_write(input_links_nested_list, self.link_input_path) """ /// """
def __init__(self, input_url):
    """Prepare a text extractor for the site behind ``input_url``.

    Runs the parent initializer, calls ``self.mk_filesystem`` for
    ``self.text_folder`` (the txt output folder) and loads the page list
    from ``self.page_list_path`` into ``self.page_list``.
    """
    super(Text_Extractor, self).__init__(input_url)

    # folder for the txt output
    self.mk_filesystem(self.text_folder)

    # page list is retrieved from file as a list of lists
    stored_page_list = c_m.l_of_l_read(self.page_list_path)
    self.page_list = stored_page_list
def __init__(self, root): print "__init__: BUILDING GUI" self.root = root root.title("VERI") self.cont = ttk.Frame(root, relief='sunken') # __init__: unusual tab positioning is achieved through ttk Style functionality self.change_noteb = ttk.Style() self.change_tab = ttk.Style() self.change_noteb.configure('TNotebook', tabposition='wn') self.change_tab.configure('TNotebook.Tab', borderwidth=5) #/ # __init__: ttk widget instantiation self.noteb = ttk.Notebook(self.cont) self.button_frame = ttk.Frame(self.cont, relief='sunken') self.good_button = Button(self.button_frame, text='VERI_GOOD', width=20, height=5, bg='green', command=self.veri_good_b) self.bad_button = Button(self.button_frame, text='VERI_BAD', width=20, height=5, bg='red', command=self.veri_bad_b) self.h_light_frame = Frame(self.cont, relief='raised', background='yellow', borderwidth='3') self.lang_b_frame = Frame(self.cont, relief='raised') self.eng_b = Button(self.lang_b_frame, text='english', relief='raised', command=self.load_eng_text) self.original_lang_b = Button(self.lang_b_frame, text='original', relief='raised', command=self.load_original_text) # __init__: text box for highlight input self.input_header = ttk.Label(self.h_light_frame, text='TEST_key', background='yellow') self.input_as_text = Text(self.h_light_frame, width=20, height=39, borderwidth=3, relief='sunken') #/ # __init__: gridding STATIC gui components self.cont.grid(column=0, row=0, sticky=(N, W, S, E)) self.button_frame.grid(column=0, row=0, sticky=(N, W, S, E)) self.noteb.grid(column=0, row=1, sticky=(N, W, S, E)) self.good_button.grid(column=0, row=0, sticky=(N, W)) self.bad_button.grid(column=1, row=0, sticky=(N, W)) self.lang_b_frame.grid(column=1, row=0, sticky=E) self.h_light_frame.grid(column=1, row=1, padx=3) self.eng_b.grid(column=0, row=0, sticky=(E), padx=5, pady=3) self.original_lang_b.grid(column=0, row=1, sticky=(E), padx=5, pady=3) self.input_header.grid(column=0, row=0, sticky=(N, S)) 
self.input_as_text.grid(column=0, row=1, sticky=(N, S)) #/ # __init__: instances of DYNAMIC notebook tabs # max no. == 20 of tabs instances self.t_tab0 = Text(self.noteb, width=100, height=40, font=("Arial", "10")) self.t_tab1 = Text(self.noteb) self.t_tab2 = Text(self.noteb) self.t_tab3 = Text(self.noteb) self.t_tab4 = Text(self.noteb) self.t_tab5 = Text(self.noteb) self.t_tab6 = Text(self.noteb) self.t_tab7 = Text(self.noteb) self.t_tab8 = Text(self.noteb) self.t_tab9 = Text(self.noteb) self.t_tab10 = Text(self.noteb) self.t_tab11 = Text(self.noteb) self.t_tab12 = Text(self.noteb) self.t_tab13 = Text(self.noteb) self.t_tab14 = Text(self.noteb) self.t_tab15 = Text(self.noteb) self.t_tab16 = Text(self.noteb) self.t_tab17 = Text(self.noteb) self.t_tab18 = Text(self.noteb) self.t_tab19 = Text(self.noteb) # __init__: purpose: adding to list for ease of acces self.t_tab_list = [ self.t_tab0, self.t_tab1, self.t_tab2, self.t_tab3, self.t_tab4, self.t_tab5, self.t_tab6, self.t_tab7, self.t_tab8, self.t_tab9, self.t_tab10, self.t_tab11, self.t_tab12, self.t_tab13, self.t_tab14, self.t_tab15, self.t_tab16, self.t_tab17, self.t_tab18, self.t_tab19 ] #/ print "__init__: GUI INSTANCE READY" print "__init__: INITIALIZING PATHS AND FILE NAMES" self.static_part_path = "site_selector\\site_data\\" self.pc_specific_part_path = os.path.dirname( os.path.realpath(__file__)) + "\\" self.main_path = self.pc_specific_part_path + self.static_part_path self.very_ready_f_name = '1_ready_for_veri.txt' self.very_ready_f_path = self.main_path + self.very_ready_f_name self.very_good_f_name = '2_1_good_veri.txt' self.very_good_f_path = self.main_path + self.very_good_f_name self.very_bad_f_name = '2_2_bad_veri.txt' self.very_bad_f_path = self.main_path + self.very_bad_f_name self.all_hlights_f_name = '3_1_all_highlights.txt' self.all_hlights_f_path = self.main_path + self.all_hlights_f_name print "__init__: ASSIGNING INITIAL VALUES FOR GLOBALY USED DYNAMIC VARIABLES" self.eng_text = False # 
initial loaded original text self.eng_b_second_try = False # button second click controler self.original_b_second_try = True # button second click controler, True - initial language original self.active_link_url_no_domain_list = [ ] # for page link without domain later use after adding sum key hit per page in tab header print "__init__: INITIAL TABS LOADING AND HIGHLIGHTING" #__init__: READING SITE LINK (very_ready) FILE" self.veri_ready_list = c_m.l_of_l_read(self.very_ready_f_path) #__init__: LOADING INITIAL TAB DATA SET ACCORDING TO FIRST SITE ENTRY IN veri_ready" self.init_site_data = self.veri_ready_list[0] self.init_tabs_data_set_list = self.mk_tabs_data_set( self.init_site_data) print "__init__: NUMBER OF PAGES TO BE LOADED %d" % len( self.init_tabs_data_set_list) self.laod_tabs(self.init_tabs_data_set_list, self.eng_text) print "__init__: INITIATING HIGHLIGHTS FUCTIONALITY" #__init__: READING HIGHLIGHT KEYWORDS FILE" self.init_h_key_list = c_m.l_of_l_read( self.all_hlights_f_path) # returns a list of lists self.key_position_lib = { } # initial key position lib for faster highlighting removal # handling empty h key list if len(self.init_h_key_list) > 0: #__init__: KEYWORDS PRESENT IN FILE INITIALIZING HIGHLIGHTING FUNCTIONALITY self.test_hi_key_list = self.init_h_key_list[ 0] # index zero currenlty is test highlights, can be added other, always_positive, always_negative #__init__: SHOWING KEYS FROM FILE IN KEYS INPUT FIELD self.test_hi_key_list = self.write_key_input_field( self.test_hi_key_list ) # adjusting for empty items in the text file #__init__: HIGHLIGHTING KEYS FROM FILE IN TAB\PAGE TEXT self.highlight_keys_in_site(self.test_hi_key_list) else: #__init__: THE INITIAL HIGHLIGHT KEY LIST IS EMPTY self.init_h_key_list = [ [], [], [] ] # assigning initial apropriate format empty list value self.test_hi_key_list = self.init_h_key_list[ 0] # test key list initial empty list value # "Key input field Return key binding to highlighting controler method" 
self.input_as_text.bind('<Return>', self.react_to_highlighting_request)
def mk_eng_txt_files(self): """ Takes a list of texts parts feeds to translate, gets the translated text, stacks it together""" self.page_list_path self.text_folder_path main_path = 'E:\\Python_work_files\\Projects\\site_data' main_list_f_name = "main_list.txt" new_page_list = [] print "RETRIEVING MAIN_LIST.........." main_list = c_m.l_of_l_read(self.page_list_path) # iterating throug unique text per page txts for data_set in main_list: p_text_f_name = data_set[2] print "TRANSLATING TEXT FROM FILE %s" % p_text_f_name eng_p_text_f_name = "eng_" + p_text_f_name error_eng_p_text_f_name = "error_" + eng_p_text_f_name page_text = c_m.simply_read(self.text_folder_path, p_text_f_name) # if page has less than 10 symbols it is not translated if len(page_text) < 10: c_m.simply_write(page_text, self.text_eng_folder_path, eng_p_text_f_name) continue # initial text output value for while loop text_output = "" # loop safety paramters track_while_loops = 0 max_while_loops = 5 # sleep times for not to abuse translate and for every part to work: ext_get_sleep = 2 text_translate_sleep = 1 text_paste_sleep = 1 text_copy_to_clip_sleep = 1 # big while cycle to submit input as long as no output is generated while len(text_output) == 0: # eternal loop safeguard if track_while_loops > max_while_loops: # raise error print "MAXIMUM TRIES TO TO TRANSLATE THE SAMPLE EXEEDED \npath: %s\\%s\\eng_TXT\\%s" % ( main_path, self.domain, error_eng_p_text_f_name) error_massage = "error when translating page" c_m.simply_write(error_massage, self.text_eng_folder_path, error_eng_p_text_f_name) # getting translate popup as html page: self.driver.get( "chrome-extension://aapbdbdomjkkjkaonfhkkikfgjllcleb/popup.html" ) print "SLEEPING %d seconds, after EXTENSION GET" % ext_get_sleep time.sleep(ext_get_sleep) print "POPULATING CLIPBOARD with text from file...." 
pyperclip.copy(page_text) print "Sleeping after copying to clipboard for %d s" % text_copy_to_clip_sleep time.sleep(text_copy_to_clip_sleep) # Finding text input element text_input_el = self.driver.find_element_by_id('text-input') #sending command via selenium Keys text_input_el.send_keys(Keys.CONTROL, 'v') print "Sleeping after pasting for %d" % text_paste_sleep time.sleep(text_paste_sleep) #submit to translate print "Pressing return" text_input_el.send_keys(Keys.RETURN) text_output_tries = 0 # initial number of small while loop tries # skips find output element and take text when trying for the first time if track_while_loops == 0 and eng_p_text_f_name == "eng_page_0_text.txt": track_while_loops += 1 continue # small while cycle waiting for input to be processed while text_output_tries < 5 and not text_output: text_output_tries += 1 print "sleeping after submition for %d s. Waiting for text to be translated" % text_translate_sleep time.sleep(text_translate_sleep) #find output element, take text try: text_output_el = self.driver.find_element_by_xpath( '//*[@id="translation"]/div[6]') text_output = text_output_el.text print "TRANS_TEST_1: input sample\n%r" % page_text print "TRANS_TEST_1: text_sample_ouput\n%r" % text_output except: print "Trying for %d time - THE TRANSLATED SAMPLE WAS NOT GENERATED" % text_output_tries print "WRITING TRANSLATED OUTPUT TO FILE: ", eng_p_text_f_name c_m.simply_write(text_output, self.text_eng_folder_path, eng_p_text_f_name) data_set.append(eng_p_text_f_name) new_page_list.append(data_set) track_while_loops += 1 # incrementing tries print "DONE TRANSLATING SITE %s " % self.domain print "UPDATING PAGE LIST WITH ENG TEXT FILE NAMES" c_m.l_of_l_write(new_page_list, self.page_list_path) self.driver.quit() # working chrome window closing