def mk_eng_txt_files(self, **rewrite):
        """ Takes a list of texts parts feeds to translate,
        gets the translated text,
        stacks it together"""

        print "mk_eng_txt_files: RETRIEVING PAGE_LIST.........."
        page_list = c_m.l_of_l_read(self.page_list_path)

        if len(page_list) < 1:  # handling empty page_list case
            print "mk_eng_txt_files: PAGE LIST IS NOT POPULATED, RUN HTML_File_Maker AND Text_Extractor MODULES FIRST"
        else:  # handling page_list for partially transalted sites
            print "mk_eng_txt_files: IN CASE PAGE LIST ALREADY HAD SOME ENG_TEXT ENTRIES SETTING INITIAL new_page_list TO LAST KNOWN PAGE_LIST VERSION"
            self.new_page_list = copy(page_list)

        # iterating throug unique text per page txts
        for data_set in page_list:
            self.p_text_f_name = data_set[2]
            print "mk_eng_txt_files: TRANSLATING TEXT FROM FILE %s" % self.p_text_f_name

            self.eng_p_text_f_name = "eng_" + self.p_text_f_name
            self.eng_p_text_f_path = self.text_eng_folder_path + self.eng_p_text_f_name

            self.page_text = c_m.simply_read(self.text_folder_path,
                                             self.p_text_f_name)

            # if page has less than 10 symbols it is not translated
            if len(self.page_text) < 10:
                print "mk_eng_txt_files: NOT WORTH TRANSLATING, WRITING AS IS AND SKIPPING..."
                c_m.simply_write(self.page_text, self.eng_p_text_f_path)

            elif len(self.page_text) > self.max_page_length:
                print "mk_eng_txt_files: PAGE TEXT IS TOO LONG DEVIDING TO PARTS, TRANSLATING AND GETTING BACK FULL PAGE TEXT"
                text_output = self.get_text_parts(**rewrite)

            else:  # 10 < len(page_text) < 2000

                if rewrite["rewrite"]:
                    print "mk_eng_txt_files: TRANSLATING IN REWRITE MODE"
                    text_output = self.get_text()

                elif not os.path.exists(self.eng_p_text_f_path):
                    print "mk_eng_txt_files: TRANSLATING IN ONLY ONCE MODE"
                    text_output = self.get_text()

                else:
                    print "mk_eng_txt_files: SKIPPING FILE, ALREADY TRANSLATED"
                    # continue

            # print "WRITING TRANSLATED OUTPUT TO FILE: ", self.eng_p_text_f_name
            # c_m.simply_write(text_output, self.text_eng_folder_path, self.eng_p_text_f_name)
                data_set.append(self.eng_p_text_f_name
                                )  # updating dataset with eng_text file name
                self.new_page_list.append(
                    data_set)  # updating page list with updated entry

        print "mk_eng_txt_files: DONE TRANSLATING SITE %s " % self.domain
        print "mk_eng_txt_files: UPDATING PAGE LIST WITH ENG TEXT FILE NAMES"
        c_m.l_of_l_write(self.new_page_list, self.page_list_path)
        print "mk_eng_txt_files: SITE TRANSLATION FINISHED, CLOSING CHROME WEBDIRVER"
        self.loaded_driver.quit()
# Example no. 2
# 0
    def drive_input_links(self):
        # driving links
        input_links_nested_list = [[], []]

        while len(input_links_nested_list) > 0:
            # retieving input_link_list
            input_links_nested_list = c_m.l_of_l_read(self.link_input_path)

            url = input_links_nested_list[0][
                1]  #input link in nested list type data table.
            # will raise out of range error if empty input link file is provided.
            print "drive_input_links: INPUT URL:", url

            # driving modules
            site_selector = Common_Paths(url)  # System module

            # Comment out if you do not need site link management
            site_link_manager_sys = Site_Link_Manager(url)
            # checking if link is duplicate
            is_duplicate = site_link_manager_sys.check_if_duplicate()
            if is_duplicate:
                print "drive_input_links: GOING TO THE NEXT SITE LINK"
                input_links_nested_list.pop(0)
                c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
                continue
            print "drive_input_links: DUBLI CHECK DONE"
            #/

            # Comment out if you do not need html retrieval
            html_maker = HTML_File_Maker(url)
            site_bs_object_dict = html_maker.mk_htmls(
            )  # output html bs_object dict for next module
            print "drive_input_links: HTML FILES DONE BS OBJECTS READY"
            #/

            # Comment out if you do not need text extraction
            txt_maker = Text_Extractor(url)
            txt_maker.mk_text_files(site_bs_object_dict)
            print "drive_input_links: TEXT FILE MAKER DONE"
            #/

            # Comment out if you do not need english translation
            translator = Translator(url)
            translator.mk_eng_txt_files()
            print "drive_input_links: TRANSLATOR DONE"
            print "drive_input_links: ADDING TO VERI GOOD FILE"
            #/

            # Comment out if you do not need site link management
            site_link_manager_sys.add_to_veri_good()
            print "drive_input_links: LINK DONE, POPING..."
            input_links_nested_list.pop(0)
            print "drive_input_links: UPDATING INPUT LINK DATA"
            c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
    def check_if_duplicate(self):

        # loads lists from ready, verified as good and bad text files
        print "READing VERI READY LIST"
        veri_ready_list = c_m.l_of_l_read(self.vr_file_path)
        print "READ VERI GOOD"
        veri_good_list = c_m.l_of_l_read(self.vg_file_path)
        print "READ VERI BAD"
        veri_bad_list = c_m.l_of_l_read(self.vb_file_path)

        #/
        print "MERGING TO ALL ENTRIES LISTS......"
        old_entries_list = veri_ready_list + veri_good_list + veri_bad_list  # adding all lists into one

        print "EXTRACTING BASE URL FOR DUPLI CHECKING END ID AS LINK"
        old_base_url_id_dict = {}
        for entry in old_entries_list:
            current_site_link_id = entry[0]

            print "CURRENT ENTRY", entry
            current_site_base_url = entry[3]
            old_base_url_id_dict[current_site_base_url] = current_site_link_id

            self.old_id_list.append(
                current_site_link_id)  # for next id generation

        try:
            old_base_url_id_dict[self.base_url]
            print "DUPLICATE LINK FOUND"
            print "ADDING TO DUPLICATE link FILE\npath:%s" % self.dupli_file_path
            dupli_entry_pack = [
                old_base_url_id_dict[self.base_url], "DUPLICATE_BASE_URL",
                self.base_url
            ]
            c_m.txt_file_append(dupli_entry_pack, self.dupli_file_path)
            return True
        except KeyError:
            print "INPUT SITE LINK IS UNIQUE"
            return False
    def mk_tabs_data_set(self, site_data_entry):

        """According to site data from 1_ready_for_veri file
        retrieves page data from page_list file,
        places homepage as first tab,
        sorts other according to page text symbol count from biggest down"""

        print "mk_tabs_data_set: RETRIEVING AND REARANGING PAGE LIST ENTRY DATA"
        base_url = site_data_entry[3]

        page_list_f_name = 'page_list.txt'
        url_domain = c_m.strip_to_domain(base_url) # domain acts as key in filesystem composed of folders named as domains
        print "FOR DOMAIN: ", url_domain
        domain_folder = url_domain + "\\"
        page_list_path = self.main_path + domain_folder + page_list_f_name
        page_list = c_m.l_of_l_read(page_list_path)

        print "mk_tabs_data_set: ADJUSTING PAGE LIST"
        adjusted_page_list = self.mk_adjusted_page_list(page_list) # sorting list according to unique page text length
        return adjusted_page_list
    def drive_input_links(self):
        """ DRIVES INPUT LINKS AND MODULES,
            FOR PARTIAL FUNCTIONALITY COMMENT OUT MARKED MODULES"""
        # driving links
        input_links_nested_list = [[], []]

        while len(input_links_nested_list) > 0:
            # retieving input_link_list
            input_links_nested_list = c_m.l_of_l_read(self.link_input_path)
            try:
                url = input_links_nested_list[0][
                    1]  #input link in nested list type data table.
                # will raise out of range error if empty input link file is provided.
            except IndexError:
                print "drive_input_links: IndexError: MOST LIKELY INPUT FILE IS EMPTY, ADD SOME SITE LINKS"
                break

            print "drive_input_links: INPUT URL:", url

            # driving modules
            site_selector = Common_Paths(url)  # System module

            # ---COMMENT OUT if you do not need SITE LINK MANAGEMENT AND INPUT URL DUBLI CHECK:
            # site_link_manager_sys = Site_Link_Manager(url)
            # is_duplicate = site_link_manager_sys.check_if_duplicate()
            # if is_duplicate:
            # print "drive_input_links: INPUT SITE LINK IS DUPLICATE, TAKING NEXT LINK"
            # input_links_nested_list.pop(0)
            # c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
            # continue
            # print "drive_input_links: DUBLI CHECK DONE"
            #///

            # ---COMMENT OUT if you do not need HTML RETRIEVAL
            html_maker = HTML_File_Maker(url)
            site_bs_object_dict = html_maker.mk_htmls(
            )  # output html bs_object dict for next module
            if site_bs_object_dict is False:
                failed_entry = ["FAILED", url]
                c_m.txt_file_append(failed_entry, self.site_failed_path)
                input_links_nested_list.pop(0)
                continue
            print "drive_input_links: HTML FILES DONE BS OBJECTS READY"
            #///

            # ---COMMENT OUT if you do not need TEXT EXTRACTION
            txt_maker = Text_Extractor(url)
            txt_maker.mk_text_files(site_bs_object_dict)
            print "drive_input_links: TEXT FILE MAKER DONE"
            #///

            # ---COMMENT OUT if you do not need ENGLISH TRANSLATION
            translator = Translator(url)
            translator.mk_eng_txt_files(rewrite=False)
            print "drive_input_links: TRANSLATOR DONE"
            #///

            # ---COMMENT OUT do not need VERI_GOOD FILE UPDATE.
            # print "drive_input_links: ADDING TO VERI GOOD FILE"
            # site_link_manager_sys.add_to_veri_good()
            # print "drive_input_links: LINK DONE, POPING..."
            #///
            """ !!! POP INPUT LINK AND UPDATE INPUT LINK FILE NECESARRY FOR NON ETERNAL LOOP """
            input_links_nested_list.pop(0)
            print "drive_input_links: UPDATING INPUT LINK DATA"
            c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
            """ /// """
 def __init__(self, input_url):
     """Initialise the parent with input_url, then prepare text extraction.

     Creates the folder for txt output through self.mk_filesystem and loads
     the page list file into self.page_list.
     """
     super(Text_Extractor, self).__init__(input_url)

     # creates folder for txt output
     self.mk_filesystem(self.text_folder)

     # retrieve page_list from file as list of lists
     stored_page_list = c_m.l_of_l_read(self.page_list_path)
     self.page_list = stored_page_list
# Example no. 7
# 0
    def __init__(self, root):
        print "__init__: BUILDING GUI"
        self.root = root
        root.title("VERI")
        self.cont = ttk.Frame(root, relief='sunken')

        # __init__: unusual tab positioning is achieved through ttk Style functionality
        self.change_noteb = ttk.Style()
        self.change_tab = ttk.Style()
        self.change_noteb.configure('TNotebook', tabposition='wn')
        self.change_tab.configure('TNotebook.Tab', borderwidth=5)
        #/
        # __init__: ttk widget instantiation
        self.noteb = ttk.Notebook(self.cont)
        self.button_frame = ttk.Frame(self.cont, relief='sunken')
        self.good_button = Button(self.button_frame,
                                  text='VERI_GOOD',
                                  width=20,
                                  height=5,
                                  bg='green',
                                  command=self.veri_good_b)
        self.bad_button = Button(self.button_frame,
                                 text='VERI_BAD',
                                 width=20,
                                 height=5,
                                 bg='red',
                                 command=self.veri_bad_b)
        self.h_light_frame = Frame(self.cont,
                                   relief='raised',
                                   background='yellow',
                                   borderwidth='3')
        self.lang_b_frame = Frame(self.cont, relief='raised')
        self.eng_b = Button(self.lang_b_frame,
                            text='english',
                            relief='raised',
                            command=self.load_eng_text)
        self.original_lang_b = Button(self.lang_b_frame,
                                      text='original',
                                      relief='raised',
                                      command=self.load_original_text)

        # __init__: text box for highlight input
        self.input_header = ttk.Label(self.h_light_frame,
                                      text='TEST_key',
                                      background='yellow')
        self.input_as_text = Text(self.h_light_frame,
                                  width=20,
                                  height=39,
                                  borderwidth=3,
                                  relief='sunken')
        #/

        # __init__: gridding STATIC gui components
        self.cont.grid(column=0, row=0, sticky=(N, W, S, E))
        self.button_frame.grid(column=0, row=0, sticky=(N, W, S, E))
        self.noteb.grid(column=0, row=1, sticky=(N, W, S, E))
        self.good_button.grid(column=0, row=0, sticky=(N, W))
        self.bad_button.grid(column=1, row=0, sticky=(N, W))
        self.lang_b_frame.grid(column=1, row=0, sticky=E)
        self.h_light_frame.grid(column=1, row=1, padx=3)
        self.eng_b.grid(column=0, row=0, sticky=(E), padx=5, pady=3)
        self.original_lang_b.grid(column=0, row=1, sticky=(E), padx=5, pady=3)
        self.input_header.grid(column=0, row=0, sticky=(N, S))
        self.input_as_text.grid(column=0, row=1, sticky=(N, S))
        #/
        # __init__: instances of DYNAMIC notebook tabs
        # max no. == 20 of tabs instances
        self.t_tab0 = Text(self.noteb,
                           width=100,
                           height=40,
                           font=("Arial", "10"))
        self.t_tab1 = Text(self.noteb)
        self.t_tab2 = Text(self.noteb)
        self.t_tab3 = Text(self.noteb)
        self.t_tab4 = Text(self.noteb)
        self.t_tab5 = Text(self.noteb)
        self.t_tab6 = Text(self.noteb)
        self.t_tab7 = Text(self.noteb)
        self.t_tab8 = Text(self.noteb)
        self.t_tab9 = Text(self.noteb)
        self.t_tab10 = Text(self.noteb)
        self.t_tab11 = Text(self.noteb)
        self.t_tab12 = Text(self.noteb)
        self.t_tab13 = Text(self.noteb)
        self.t_tab14 = Text(self.noteb)
        self.t_tab15 = Text(self.noteb)
        self.t_tab16 = Text(self.noteb)
        self.t_tab17 = Text(self.noteb)
        self.t_tab18 = Text(self.noteb)
        self.t_tab19 = Text(self.noteb)

        # __init__: purpose: adding to list for ease of acces
        self.t_tab_list = [
            self.t_tab0, self.t_tab1, self.t_tab2, self.t_tab3, self.t_tab4,
            self.t_tab5, self.t_tab6, self.t_tab7, self.t_tab8, self.t_tab9,
            self.t_tab10, self.t_tab11, self.t_tab12, self.t_tab13,
            self.t_tab14, self.t_tab15, self.t_tab16, self.t_tab17,
            self.t_tab18, self.t_tab19
        ]
        #/
        print "__init__: GUI INSTANCE READY"

        print "__init__: INITIALIZING PATHS AND FILE NAMES"
        self.static_part_path = "site_selector\\site_data\\"
        self.pc_specific_part_path = os.path.dirname(
            os.path.realpath(__file__)) + "\\"
        self.main_path = self.pc_specific_part_path + self.static_part_path

        self.very_ready_f_name = '1_ready_for_veri.txt'
        self.very_ready_f_path = self.main_path + self.very_ready_f_name
        self.very_good_f_name = '2_1_good_veri.txt'
        self.very_good_f_path = self.main_path + self.very_good_f_name
        self.very_bad_f_name = '2_2_bad_veri.txt'
        self.very_bad_f_path = self.main_path + self.very_bad_f_name
        self.all_hlights_f_name = '3_1_all_highlights.txt'
        self.all_hlights_f_path = self.main_path + self.all_hlights_f_name

        print "__init__: ASSIGNING INITIAL VALUES FOR GLOBALY USED DYNAMIC VARIABLES"
        self.eng_text = False  # initial loaded original text
        self.eng_b_second_try = False  # button second click controler
        self.original_b_second_try = True  # button second click controler, True - initial language original
        self.active_link_url_no_domain_list = [
        ]  # for page link without domain later use after adding sum key hit per page in tab header

        print "__init__: INITIAL TABS LOADING AND HIGHLIGHTING"
        #__init__: READING SITE LINK (very_ready) FILE"
        self.veri_ready_list = c_m.l_of_l_read(self.very_ready_f_path)

        #__init__: LOADING INITIAL TAB DATA SET ACCORDING TO FIRST SITE ENTRY IN veri_ready"
        self.init_site_data = self.veri_ready_list[0]
        self.init_tabs_data_set_list = self.mk_tabs_data_set(
            self.init_site_data)

        print "__init__: NUMBER OF PAGES TO BE LOADED %d" % len(
            self.init_tabs_data_set_list)
        self.laod_tabs(self.init_tabs_data_set_list, self.eng_text)

        print "__init__: INITIATING HIGHLIGHTS FUCTIONALITY"

        #__init__: READING HIGHLIGHT KEYWORDS FILE"
        self.init_h_key_list = c_m.l_of_l_read(
            self.all_hlights_f_path)  # returns a list of lists

        self.key_position_lib = {
        }  # initial key position lib for faster highlighting removal

        # handling empty h key list
        if len(self.init_h_key_list) > 0:
            #__init__: KEYWORDS PRESENT IN FILE INITIALIZING HIGHLIGHTING FUNCTIONALITY
            self.test_hi_key_list = self.init_h_key_list[
                0]  # index zero currenlty is test highlights, can be added other, always_positive, always_negative
            #__init__: SHOWING KEYS FROM FILE IN KEYS INPUT FIELD
            self.test_hi_key_list = self.write_key_input_field(
                self.test_hi_key_list
            )  # adjusting for empty items in the text file
            #__init__: HIGHLIGHTING KEYS FROM FILE IN TAB\PAGE TEXT
            self.highlight_keys_in_site(self.test_hi_key_list)

        else:
            #__init__: THE INITIAL HIGHLIGHT KEY LIST IS EMPTY
            self.init_h_key_list = [
                [], [], []
            ]  # assigning initial apropriate format empty list value
            self.test_hi_key_list = self.init_h_key_list[
                0]  # test key list initial empty list value

        # "Key input field Return key binding to highlighting controler method"
        self.input_as_text.bind('<Return>', self.react_to_highlighting_request)
# Example no. 8
# 0
    def mk_eng_txt_files(self):
        """ Takes a list of texts parts feeds to translate,
        gets the translated text,
        stacks it together"""

        self.page_list_path
        self.text_folder_path
        main_path = 'E:\\Python_work_files\\Projects\\site_data'
        main_list_f_name = "main_list.txt"
        new_page_list = []

        print "RETRIEVING MAIN_LIST.........."
        main_list = c_m.l_of_l_read(self.page_list_path)

        # iterating throug unique text per page txts
        for data_set in main_list:

            p_text_f_name = data_set[2]
            print "TRANSLATING TEXT FROM FILE %s" % p_text_f_name

            eng_p_text_f_name = "eng_" + p_text_f_name
            error_eng_p_text_f_name = "error_" + eng_p_text_f_name

            page_text = c_m.simply_read(self.text_folder_path, p_text_f_name)

            # if page has less than 10 symbols it is not translated
            if len(page_text) < 10:
                c_m.simply_write(page_text, self.text_eng_folder_path,
                                 eng_p_text_f_name)
                continue

            # initial text output value for while loop
            text_output = ""

            # loop safety paramters
            track_while_loops = 0
            max_while_loops = 5

            # sleep times for not to abuse translate and for every part to work:
            ext_get_sleep = 2
            text_translate_sleep = 1
            text_paste_sleep = 1
            text_copy_to_clip_sleep = 1

            # big while cycle to submit input as long as no output is generated
            while len(text_output) == 0:
                # eternal loop safeguard
                if track_while_loops > max_while_loops:
                    # raise error
                    print "MAXIMUM TRIES TO TO TRANSLATE THE SAMPLE EXEEDED \npath: %s\\%s\\eng_TXT\\%s" % (
                        main_path, self.domain, error_eng_p_text_f_name)
                    error_massage = "error when translating page"
                    c_m.simply_write(error_massage, self.text_eng_folder_path,
                                     error_eng_p_text_f_name)

                # getting translate popup as html page:
                self.driver.get(
                    "chrome-extension://aapbdbdomjkkjkaonfhkkikfgjllcleb/popup.html"
                )
                print "SLEEPING %d seconds, after EXTENSION GET" % ext_get_sleep
                time.sleep(ext_get_sleep)

                print "POPULATING CLIPBOARD with text from file...."
                pyperclip.copy(page_text)
                print "Sleeping after copying to clipboard for %d s" % text_copy_to_clip_sleep
                time.sleep(text_copy_to_clip_sleep)

                # Finding text input element
                text_input_el = self.driver.find_element_by_id('text-input')
                #sending command via selenium Keys
                text_input_el.send_keys(Keys.CONTROL, 'v')
                print "Sleeping after pasting for %d" % text_paste_sleep
                time.sleep(text_paste_sleep)
                #submit to translate
                print "Pressing return"
                text_input_el.send_keys(Keys.RETURN)

                text_output_tries = 0  # initial number of small while loop tries

                # skips find output element and take text when trying for the first time
                if track_while_loops == 0 and eng_p_text_f_name == "eng_page_0_text.txt":
                    track_while_loops += 1
                    continue

                # small while cycle waiting for input to be processed
                while text_output_tries < 5 and not text_output:
                    text_output_tries += 1
                    print "sleeping after submition for %d s. Waiting for text to be translated" % text_translate_sleep
                    time.sleep(text_translate_sleep)

                    #find output element, take text
                    try:
                        text_output_el = self.driver.find_element_by_xpath(
                            '//*[@id="translation"]/div[6]')
                        text_output = text_output_el.text
                        print "TRANS_TEST_1: input sample\n%r" % page_text
                        print "TRANS_TEST_1: text_sample_ouput\n%r" % text_output
                    except:
                        print "Trying for %d time - THE TRANSLATED SAMPLE WAS NOT GENERATED" % text_output_tries

            print "WRITING TRANSLATED OUTPUT TO FILE: ", eng_p_text_f_name
            c_m.simply_write(text_output, self.text_eng_folder_path,
                             eng_p_text_f_name)
            data_set.append(eng_p_text_f_name)
            new_page_list.append(data_set)
            track_while_loops += 1  # incrementing tries
        print "DONE TRANSLATING SITE %s " % self.domain
        print "UPDATING PAGE LIST WITH ENG TEXT FILE NAMES"
        c_m.l_of_l_write(new_page_list, self.page_list_path)
        self.driver.quit()  # working chrome window closing