def mk_eng_txt_files(self, **rewrite): """ Takes a list of texts parts feeds to translate, gets the translated text, stacks it together""" print "mk_eng_txt_files: RETRIEVING PAGE_LIST.........." page_list = c_m.l_of_l_read(self.page_list_path) if len(page_list) < 1: # handling empty page_list case print "mk_eng_txt_files: PAGE LIST IS NOT POPULATED, RUN HTML_File_Maker AND Text_Extractor MODULES FIRST" else: # handling page_list for partially transalted sites print "mk_eng_txt_files: IN CASE PAGE LIST ALREADY HAD SOME ENG_TEXT ENTRIES SETTING INITIAL new_page_list TO LAST KNOWN PAGE_LIST VERSION" self.new_page_list = copy(page_list) # iterating throug unique text per page txts for data_set in page_list: self.p_text_f_name = data_set[2] print "mk_eng_txt_files: TRANSLATING TEXT FROM FILE %s" % self.p_text_f_name self.eng_p_text_f_name = "eng_" + self.p_text_f_name self.eng_p_text_f_path = self.text_eng_folder_path + self.eng_p_text_f_name self.page_text = c_m.simply_read(self.text_folder_path, self.p_text_f_name) # if page has less than 10 symbols it is not translated if len(self.page_text) < 10: print "mk_eng_txt_files: NOT WORTH TRANSLATING, WRITING AS IS AND SKIPPING..." c_m.simply_write(self.page_text, self.eng_p_text_f_path) elif len(self.page_text) > self.max_page_length: print "mk_eng_txt_files: PAGE TEXT IS TOO LONG DEVIDING TO PARTS, TRANSLATING AND GETTING BACK FULL PAGE TEXT" text_output = self.get_text_parts(**rewrite) else: # 10 < len(page_text) < 2000 if rewrite["rewrite"]: print "mk_eng_txt_files: TRANSLATING IN REWRITE MODE" text_output = self.get_text() elif not os.path.exists(self.eng_p_text_f_path): print "mk_eng_txt_files: TRANSLATING IN ONLY ONCE MODE" text_output = self.get_text() else: print "mk_eng_txt_files: SKIPPING FILE, ALREADY TRANSLATED" # continue # print "WRITING TRANSLATED OUTPUT TO FILE: ", self.eng_p_text_f_name # c_m.simply_write(text_output, self.text_eng_folder_path, self.eng_p_text_f_name) data_set.append(self.eng_p_text_f_name ) # updating dataset with eng_text file name self.new_page_list.append( data_set) # updating page list with updated entry print "mk_eng_txt_files: DONE TRANSLATING SITE %s " % self.domain print "mk_eng_txt_files: UPDATING PAGE LIST WITH ENG TEXT FILE NAMES" c_m.l_of_l_write(self.new_page_list, self.page_list_path) print "mk_eng_txt_files: SITE TRANSLATION FINISHED, CLOSING CHROME WEBDIRVER" self.loaded_driver.quit()
def mk_htmls(self): """ Makes unicode local html copies creates page_list file with file_name and respective link url as data entries""" domain_link_list = self.scrape_domain_int_links() # driving links, to collect htmls from web and create htmls with removed img elements for ind in range(len(domain_link_list)): page_file_name = "page_" + str( ind + 1) + ".html" # gen file names for html file of page on site active_page_link = domain_link_list[ind] if ";;;" in active_page_link: # separator problems hedging self.links_contain_seprator.append( [active_page_link, page_file_name]) print "LINK CONTAINS SEPARATOR, CAN NOT ADD TO PAGE LIST, WILL RUIN DATASYSTEM, WILL BE ADDED TO SEPARATE FILE" else: BS_object = self.retrieve_html(active_page_link, self.domain_folder_name, self.html_folder_name, page_file_name) self.page_list.append([active_page_link, page_file_name]) self.bs_object_dict[active_page_link] = BS_object # if no ";;;" separator found in link, then write as usual if len(self.links_contain_seprator) > 0: c_m.l_of_l_write(self.links_contain_seprator, self.main_path, self.domain_folder_name, self.link_contains_separator) else: c_m.l_of_l_write(self.page_list, self.main_path, self.domain_folder_name, self.page_list_f_name) return self.bs_object_dict
def drive_input_links(self): # driving links input_links_nested_list = [[], []] while len(input_links_nested_list) > 0: # retieving input_link_list input_links_nested_list = c_m.l_of_l_read(self.link_input_path) url = input_links_nested_list[0][ 1] #input link in nested list type data table. # will raise out of range error if empty input link file is provided. print "drive_input_links: INPUT URL:", url # driving modules site_selector = Common_Paths(url) # System module # Comment out if you do not need site link management site_link_manager_sys = Site_Link_Manager(url) # checking if link is duplicate is_duplicate = site_link_manager_sys.check_if_duplicate() if is_duplicate: print "drive_input_links: GOING TO THE NEXT SITE LINK" input_links_nested_list.pop(0) c_m.l_of_l_write(input_links_nested_list, self.link_input_path) continue print "drive_input_links: DUBLI CHECK DONE" #/ # Comment out if you do not need html retrieval html_maker = HTML_File_Maker(url) site_bs_object_dict = html_maker.mk_htmls( ) # output html bs_object dict for next module print "drive_input_links: HTML FILES DONE BS OBJECTS READY" #/ # Comment out if you do not need text extraction txt_maker = Text_Extractor(url) txt_maker.mk_text_files(site_bs_object_dict) print "drive_input_links: TEXT FILE MAKER DONE" #/ # Comment out if you do not need english translation translator = Translator(url) translator.mk_eng_txt_files() print "drive_input_links: TRANSLATOR DONE" print "drive_input_links: ADDING TO VERI GOOD FILE" #/ # Comment out if you do not need site link management site_link_manager_sys.add_to_veri_good() print "drive_input_links: LINK DONE, POPING..." input_links_nested_list.pop(0) print "drive_input_links: UPDATING INPUT LINK DATA" c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
def react_to_highlighting_request(self, event): test_input_key_list = self.read_key_input_field() print "IN: react_to_highlighting_request" print "react_to_highlighting_request: COMPARING INIT KEYWORD LIST WITH A LIST AFTER \"RETURN\" BINDED ACTION" if len(test_input_key_list) > len(self.test_hi_key_list): print "react_to_highlighting_request: NEW HIGHLIGHT KEYs ADDED TO HIGHLIGHT LIST" #react_to_highlighting_request: find added keywords and add to key list added_key_list = [] for key in test_input_key_list: #react_to_highlighting_request: filtering out empty keys and leaving which are not in previous list if key not in self.test_hi_key_list and len(key) > 0: added_key = key added_key_list.append( added_key ) # highlight_keys_in_site tekes in keys as list print "react_to_highlighting_request: HIGHLIGHTING KEYS %r" % added_key_list self.highlight_keys_in_site(added_key_list) #react_to_highlighting_request: updating key list in local file self.init_h_key_list[0] = test_input_key_list c_m.l_of_l_write(self.init_h_key_list, self.all_hlights_f_path) self.test_hi_key_list = test_input_key_list elif len(test_input_key_list) == len( self.test_hi_key_list): # enter presed without adding anything print "react_to_highlighting_request: ENTER PRESSED WITHOUT ADDING ANY HIGHLIGHTS" self.write_key_input_field(test_input_key_list, True) # rewrite True else: #react_to_highlighting_request: details: less keys are present in key input field than previous image of the field - self.test_hi_key_list print "react_to_highlighting_request: HIGHLIGHT KEYWORDs DELETED: ", #react_to_highlighting_request: need to know which highlight is removed. for h_key in self.test_hi_key_list: if h_key in test_input_key_list: pass else: deleted_key = h_key print deleted_key self.remove_highlight(deleted_key) #react_to_highlighting_request: rewriting keys to have corect format self.write_key_input_field(test_input_key_list, True) #react_to_highlighting_request: write new key list to txt file self.init_h_key_list[0] = test_input_key_list c_m.l_of_l_write(self.init_h_key_list, self.all_hlights_f_path) self.test_hi_key_list = test_input_key_list
def mk_text_files(self, bs_object_dict): print "MAKING UNIQUE PAGE TEXT FILES" print "TOTAL PAGES TO BE PROCESSED", len(self.page_list) for ind in range(len(self.page_list)): copy_page_list = copy(self.page_list) poped_page_data_entry = self.page_list[ind] #forming page text file name split_page_html_f_name = poped_page_data_entry[1].split('.') page_text_f_name = split_page_html_f_name[0] + "_text.txt" #making a list of site text line, with active page excluded to be able to find unique text lines self.page_list[ind].append(page_text_f_name) copy_page_list.pop(ind) active_page_excluded_txt_l_list = [] for data_line in copy_page_list: active_link = data_line[0] active_bs_object = bs_object_dict[active_link] page_text_list = self.extract_page_text(active_bs_object) active_page_excluded_txt_l_list = active_page_excluded_txt_l_list + page_text_list #removing duplicates from almost all text list clean_excluded_txt_l_list = c_m.remove_duplicates( active_page_excluded_txt_l_list) #checking if text lines from active page link are present in other pages poped_bs_object = bs_object_dict[poped_page_data_entry[0]] poped_page_text_list = self.extract_page_text(poped_bs_object) unique_text_line_list = [] for text_line in poped_page_text_list: if text_line not in clean_excluded_txt_l_list: unique_text_line_list.append(text_line) else: pass #counting lengths of texts for sorting and writing text to file symbols_written_to_page = c_m.l_of_l_write( unique_text_line_list, self.text_folder_path, page_text_f_name) # write_to_file self.page_list[ind].append(str(symbols_written_to_page)) print self.page_list[ind][2], "DONE" c_m.l_of_l_write(self.page_list, self.page_list_path) #write new page list
def veri_good_b(self): """ Resets second try states, repopulates tabs with next site data, removes current site link from veri_ready and adds to veri_good file highlights newly added text""" veri_good_site_status = "VERI_GOOD" print "IN: veri_good_b. VERI GOOD BUTTON CLICKED" print "veri_good_b: REPOPULATING TABS WITH NEXT SITE DATA" #veri_good_b: RESETTING SECOND TRY MARKERS" self.eng_b_second_try = False # clearing english button tries self.original_b_second_try = False # clearing original button tries old_tab_data_set = self.init_tabs_data_set_list #veri_good_b: DELETING OLD TABS self.delete_tabs(old_tab_data_set) self.veri_ready_list[0][1] = veri_good_site_status current_site_data_entry = self.veri_ready_list[0] #veri_good_b: ADDING CLICKED AS GOOD SITE DATA ENTRY TO VERI GOOD FILE c_m.txt_file_append(current_site_data_entry, self.very_good_f_path) #veri_good_b: GOING TO THE NEXT SITE DATA ENTRY IN VERY READI LIST self.veri_ready_list.pop(0) next_site_data_entry = self.veri_ready_list[0] #veri_good_b: LOADING NEXT SITE TABS #veri_good_b: using global init_tabs_data_set_list parameter to delete tabs after button click self.init_tabs_data_set_list = self.mk_tabs_data_set( next_site_data_entry) self.laod_tabs(self.init_tabs_data_set_list, self.eng_text) print "veri_good_b: HIGHLIGHTING NEW SITE KEYWORDS" self.highlight_keys_in_site(self.test_hi_key_list) #veri_good_b: UPDATING VERI READY FILE WITH FIRST DELETED ENTRY VERI READY LIST c_m.l_of_l_write(self.veri_ready_list, self.very_ready_f_path)
def write_key_input_field(self, list_to_write, *arg): #write_key_input_field: failsafing input list no_empty = [] for item in list_to_write: no_white_space = item.strip() # stripping white spaces characters if len(no_white_space) > 1: no_empty.append(no_white_space) else: arg = True # if found at least one empty item setting to rewriting mode print "write_key_input_field: EMPTY, WHITE SPACE, OR ONE SYMBOL KEY PRESENT IN THE KEYLIST" if arg: #write_key_input_field rewriting mode self.input_as_text.delete('1.0', END) as_text = unicode("\n".join(no_empty)) #write_key_input_field updating global keylist and file keylist with corected version self.init_h_key_list[0] = no_empty c_m.l_of_l_write(self.init_h_key_list, self.all_hlights_f_path) else: as_text = unicode("\n".join(no_empty) + "\n") self.input_as_text.insert(END, as_text) return no_empty
def drive_input_links(self): """ DRIVES INPUT LINKS AND MODULES, FOR PARTIAL FUNCTIONALITY COMMENT OUT MARKED MODULES""" # driving links input_links_nested_list = [[], []] while len(input_links_nested_list) > 0: # retieving input_link_list input_links_nested_list = c_m.l_of_l_read(self.link_input_path) try: url = input_links_nested_list[0][ 1] #input link in nested list type data table. # will raise out of range error if empty input link file is provided. except IndexError: print "drive_input_links: IndexError: MOST LIKELY INPUT FILE IS EMPTY, ADD SOME SITE LINKS" break print "drive_input_links: INPUT URL:", url # driving modules site_selector = Common_Paths(url) # System module # ---COMMENT OUT if you do not need SITE LINK MANAGEMENT AND INPUT URL DUBLI CHECK: # site_link_manager_sys = Site_Link_Manager(url) # is_duplicate = site_link_manager_sys.check_if_duplicate() # if is_duplicate: # print "drive_input_links: INPUT SITE LINK IS DUPLICATE, TAKING NEXT LINK" # input_links_nested_list.pop(0) # c_m.l_of_l_write(input_links_nested_list, self.link_input_path) # continue # print "drive_input_links: DUBLI CHECK DONE" #/// # ---COMMENT OUT if you do not need HTML RETRIEVAL html_maker = HTML_File_Maker(url) site_bs_object_dict = html_maker.mk_htmls( ) # output html bs_object dict for next module if site_bs_object_dict is False: failed_entry = ["FAILED", url] c_m.txt_file_append(failed_entry, self.site_failed_path) input_links_nested_list.pop(0) continue print "drive_input_links: HTML FILES DONE BS OBJECTS READY" #/// # ---COMMENT OUT if you do not need TEXT EXTRACTION txt_maker = Text_Extractor(url) txt_maker.mk_text_files(site_bs_object_dict) print "drive_input_links: TEXT FILE MAKER DONE" #/// # ---COMMENT OUT if you do not need ENGLISH TRANSLATION translator = Translator(url) translator.mk_eng_txt_files(rewrite=False) print "drive_input_links: TRANSLATOR DONE" #/// # ---COMMENT OUT do not need VERI_GOOD FILE UPDATE. # print "drive_input_links: ADDING TO VERI GOOD FILE" # site_link_manager_sys.add_to_veri_good() # print "drive_input_links: LINK DONE, POPING..." #/// """ !!! POP INPUT LINK AND UPDATE INPUT LINK FILE NECESARRY FOR NON ETERNAL LOOP """ input_links_nested_list.pop(0) print "drive_input_links: UPDATING INPUT LINK DATA" c_m.l_of_l_write(input_links_nested_list, self.link_input_path) """ /// """
def mk_eng_txt_files(self): """ Takes a list of texts parts feeds to translate, gets the translated text, stacks it together""" self.page_list_path self.text_folder_path main_path = 'E:\\Python_work_files\\Projects\\site_data' main_list_f_name = "main_list.txt" new_page_list = [] print "RETRIEVING MAIN_LIST.........." main_list = c_m.l_of_l_read(self.page_list_path) # iterating throug unique text per page txts for data_set in main_list: p_text_f_name = data_set[2] print "TRANSLATING TEXT FROM FILE %s" % p_text_f_name eng_p_text_f_name = "eng_" + p_text_f_name error_eng_p_text_f_name = "error_" + eng_p_text_f_name page_text = c_m.simply_read(self.text_folder_path, p_text_f_name) # if page has less than 10 symbols it is not translated if len(page_text) < 10: c_m.simply_write(page_text, self.text_eng_folder_path, eng_p_text_f_name) continue # initial text output value for while loop text_output = "" # loop safety paramters track_while_loops = 0 max_while_loops = 5 # sleep times for not to abuse translate and for every part to work: ext_get_sleep = 2 text_translate_sleep = 1 text_paste_sleep = 1 text_copy_to_clip_sleep = 1 # big while cycle to submit input as long as no output is generated while len(text_output) == 0: # eternal loop safeguard if track_while_loops > max_while_loops: # raise error print "MAXIMUM TRIES TO TO TRANSLATE THE SAMPLE EXEEDED \npath: %s\\%s\\eng_TXT\\%s" % ( main_path, self.domain, error_eng_p_text_f_name) error_massage = "error when translating page" c_m.simply_write(error_massage, self.text_eng_folder_path, error_eng_p_text_f_name) # getting translate popup as html page: self.driver.get( "chrome-extension://aapbdbdomjkkjkaonfhkkikfgjllcleb/popup.html" ) print "SLEEPING %d seconds, after EXTENSION GET" % ext_get_sleep time.sleep(ext_get_sleep) print "POPULATING CLIPBOARD with text from file...." pyperclip.copy(page_text) print "Sleeping after copying to clipboard for %d s" % text_copy_to_clip_sleep time.sleep(text_copy_to_clip_sleep) # Finding text input element text_input_el = self.driver.find_element_by_id('text-input') #sending command via selenium Keys text_input_el.send_keys(Keys.CONTROL, 'v') print "Sleeping after pasting for %d" % text_paste_sleep time.sleep(text_paste_sleep) #submit to translate print "Pressing return" text_input_el.send_keys(Keys.RETURN) text_output_tries = 0 # initial number of small while loop tries # skips find output element and take text when trying for the first time if track_while_loops == 0 and eng_p_text_f_name == "eng_page_0_text.txt": track_while_loops += 1 continue # small while cycle waiting for input to be processed while text_output_tries < 5 and not text_output: text_output_tries += 1 print "sleeping after submition for %d s. Waiting for text to be translated" % text_translate_sleep time.sleep(text_translate_sleep) #find output element, take text try: text_output_el = self.driver.find_element_by_xpath( '//*[@id="translation"]/div[6]') text_output = text_output_el.text print "TRANS_TEST_1: input sample\n%r" % page_text print "TRANS_TEST_1: text_sample_ouput\n%r" % text_output except: print "Trying for %d time - THE TRANSLATED SAMPLE WAS NOT GENERATED" % text_output_tries print "WRITING TRANSLATED OUTPUT TO FILE: ", eng_p_text_f_name c_m.simply_write(text_output, self.text_eng_folder_path, eng_p_text_f_name) data_set.append(eng_p_text_f_name) new_page_list.append(data_set) track_while_loops += 1 # incrementing tries print "DONE TRANSLATING SITE %s " % self.domain print "UPDATING PAGE LIST WITH ENG TEXT FILE NAMES" c_m.l_of_l_write(new_page_list, self.page_list_path) self.driver.quit() # working chrome window closing