コード例 #1
0
    def veri_good_b(self):
        """ Resets second try states,
        repopulates tabs with next site data,
        removes current site link from veri_ready and
        adds to veri_good file
        highlights newly added text"""

        veri_good_site_status = "VERI_GOOD"

        print "IN: veri_good_b. VERI GOOD BUTTON CLICKED"
        print "veri_good_b: REPOPULATING TABS WITH NEXT SITE DATA"

        #veri_good_b: RESETTING SECOND TRY MARKERS"
        self.eng_b_second_try = False  # clearing english button tries
        self.original_b_second_try = False  # clearing original button tries

        old_tab_data_set = self.init_tabs_data_set_list
        #veri_good_b: DELETING OLD TABS
        self.delete_tabs(old_tab_data_set)
        self.veri_ready_list[0][1] = veri_good_site_status
        current_site_data_entry = self.veri_ready_list[0]

        #veri_good_b: ADDING CLICKED AS GOOD SITE DATA ENTRY TO VERI GOOD FILE
        c_m.txt_file_append(current_site_data_entry, self.very_good_f_path)

        #veri_good_b: GOING TO THE NEXT SITE DATA ENTRY IN VERY READI LIST
        self.veri_ready_list.pop(0)
        next_site_data_entry = self.veri_ready_list[0]

        #veri_good_b: LOADING NEXT SITE TABS
        #veri_good_b: using global init_tabs_data_set_list parameter to delete tabs after button click
        self.init_tabs_data_set_list = self.mk_tabs_data_set(
            next_site_data_entry)
        self.laod_tabs(self.init_tabs_data_set_list, self.eng_text)

        print "veri_good_b: HIGHLIGHTING NEW SITE KEYWORDS"
        self.highlight_keys_in_site(self.test_hi_key_list)

        #veri_good_b: UPDATING VERI READY FILE WITH FIRST DELETED ENTRY VERI READY LIST
        c_m.l_of_l_write(self.veri_ready_list, self.very_ready_f_path)
コード例 #2
0
    def check_if_duplicate(self):

        # loads lists from ready, verified as good and bad text files
        print "READing VERI READY LIST"
        veri_ready_list = c_m.l_of_l_read(self.vr_file_path)
        print "READ VERI GOOD"
        veri_good_list = c_m.l_of_l_read(self.vg_file_path)
        print "READ VERI BAD"
        veri_bad_list = c_m.l_of_l_read(self.vb_file_path)

        #/
        print "MERGING TO ALL ENTRIES LISTS......"
        old_entries_list = veri_ready_list + veri_good_list + veri_bad_list  # adding all lists into one

        print "EXTRACTING BASE URL FOR DUPLI CHECKING END ID AS LINK"
        old_base_url_id_dict = {}
        for entry in old_entries_list:
            current_site_link_id = entry[0]

            print "CURRENT ENTRY", entry
            current_site_base_url = entry[3]
            old_base_url_id_dict[current_site_base_url] = current_site_link_id

            self.old_id_list.append(
                current_site_link_id)  # for next id generation

        try:
            old_base_url_id_dict[self.base_url]
            print "DUPLICATE LINK FOUND"
            print "ADDING TO DUPLICATE link FILE\npath:%s" % self.dupli_file_path
            dupli_entry_pack = [
                old_base_url_id_dict[self.base_url], "DUPLICATE_BASE_URL",
                self.base_url
            ]
            c_m.txt_file_append(dupli_entry_pack, self.dupli_file_path)
            return True
        except KeyError:
            print "INPUT SITE LINK IS UNIQUE"
            return False
コード例 #3
0
    def drive_input_links(self):
        """ DRIVES INPUT LINKS AND MODULES,
            FOR PARTIAL FUNCTIONALITY COMMENT OUT MARKED MODULES"""
        # driving links
        input_links_nested_list = [[], []]

        while len(input_links_nested_list) > 0:
            # retieving input_link_list
            input_links_nested_list = c_m.l_of_l_read(self.link_input_path)
            try:
                url = input_links_nested_list[0][
                    1]  #input link in nested list type data table.
                # will raise out of range error if empty input link file is provided.
            except IndexError:
                print "drive_input_links: IndexError: MOST LIKELY INPUT FILE IS EMPTY, ADD SOME SITE LINKS"
                break

            print "drive_input_links: INPUT URL:", url

            # driving modules
            site_selector = Common_Paths(url)  # System module

            # ---COMMENT OUT if you do not need SITE LINK MANAGEMENT AND INPUT URL DUBLI CHECK:
            # site_link_manager_sys = Site_Link_Manager(url)
            # is_duplicate = site_link_manager_sys.check_if_duplicate()
            # if is_duplicate:
            # print "drive_input_links: INPUT SITE LINK IS DUPLICATE, TAKING NEXT LINK"
            # input_links_nested_list.pop(0)
            # c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
            # continue
            # print "drive_input_links: DUBLI CHECK DONE"
            #///

            # ---COMMENT OUT if you do not need HTML RETRIEVAL
            html_maker = HTML_File_Maker(url)
            site_bs_object_dict = html_maker.mk_htmls(
            )  # output html bs_object dict for next module
            if site_bs_object_dict is False:
                failed_entry = ["FAILED", url]
                c_m.txt_file_append(failed_entry, self.site_failed_path)
                input_links_nested_list.pop(0)
                continue
            print "drive_input_links: HTML FILES DONE BS OBJECTS READY"
            #///

            # ---COMMENT OUT if you do not need TEXT EXTRACTION
            txt_maker = Text_Extractor(url)
            txt_maker.mk_text_files(site_bs_object_dict)
            print "drive_input_links: TEXT FILE MAKER DONE"
            #///

            # ---COMMENT OUT if you do not need ENGLISH TRANSLATION
            translator = Translator(url)
            translator.mk_eng_txt_files(rewrite=False)
            print "drive_input_links: TRANSLATOR DONE"
            #///

            # ---COMMENT OUT do not need VERI_GOOD FILE UPDATE.
            # print "drive_input_links: ADDING TO VERI GOOD FILE"
            # site_link_manager_sys.add_to_veri_good()
            # print "drive_input_links: LINK DONE, POPING..."
            #///
            """ !!! POP INPUT LINK AND UPDATE INPUT LINK FILE NECESARRY FOR NON ETERNAL LOOP """
            input_links_nested_list.pop(0)
            print "drive_input_links: UPDATING INPUT LINK DATA"
            c_m.l_of_l_write(input_links_nested_list, self.link_input_path)
            """ /// """
コード例 #4
0
 def add_to_veri_good(self):
     """Append a new entry for the current site to the file at ``self.vr_file_path``.

     The entry is ``[id, "READY_FOR_VERI", self.input_url, self.base_url]``,
     where ``id`` comes from ``self.make_next_id()``.

     NOTE(review): despite the method name, the entry is written with the
     READY_FOR_VERI status to ``vr_file_path`` -- confirm the name is just
     historical.
     """
     new_entry = [
         self.make_next_id(),
         "READY_FOR_VERI",
         self.input_url,
         self.base_url,
     ]
     c_m.txt_file_append(new_entry, self.vr_file_path)