コード例 #1
0
ファイル: job_categorizer.py プロジェクト: pmavrodiev/jobs
    def get_categories(self, job_url):
        """Resolve the categories stored for a job against the category tree.

        Every ``tbl_category`` row whose ``job_url`` column equals *job_url*
        is read, and the third column of each row is taken as the category
        name. A name that splits into two parts is looked up by the
        concatenation of the sanitized parent and leaf; a name that yields a
        single part is looked up by its sanitized full name.

        Args:
            job_url: URL of the job, used as the lookup key in tbl_category.

        Returns:
            A list with the ``self.tp.get_most_similar`` result for every
            category that could be resolved. Unresolved categories are
            logged as warnings and left out.
        """
        categories_list = []
        # Double any embedded single quote so a URL containing "'" cannot
        # terminate the SQL string literal early (broken query / injection).
        # NOTE(review): prefer a bound parameter if runQuery supports one.
        escaped_url = job_url.replace("'", "''")
        query = "SELECT * FROM tbl_category WHERE job_url='" + escaped_url + "'"
        all_categories = self.sqlite_reader.runQuery(query)

        for row in all_categories:
            cat = row[2]
            # A hyphen is used by jobs.bg to denote subcategories,
            # e.g. "ИТ - Административни дейности и продажби".
            cat_splitted = self.__split_dashes(cat)
            if len(cat_splitted) == 2:
                leaf = sanitize_id(cat_splitted[1].encode("utf-8"))
                parent = sanitize_id(cat_splitted[0].encode("utf-8"))
                # Tree identifiers of nested nodes are parent id + leaf name.
                identifier = parent + leaf
                most_similar = self.tp.get_most_similar(identifier)
                if most_similar:
                    # Category is resolved.
                    categories_list.append(most_similar)
                    JobCategorizer.logger.debug("Category %s fully resolved for job %s",
                                 cat, job_url)
                else:
                    # Leaf not found, likely a misspecified category config.
                    JobCategorizer.logger.warning(("Mismatch in hierarchical structures "
                                    "between database and category file "
                                    "for category %s, job %s"), cat, job_url)
            elif len(cat_splitted) == 1:
                # Category is not hyphenated; look it up in the tree as it is.
                leaf = sanitize_id(cat.encode("utf-8"))
                most_similar = self.tp.get_most_similar(leaf)
                # NOTE(review): this branch tests membership in self.tp while
                # the hyphenated branch tests truthiness -- confirm which
                # check is intended.
                if most_similar in self.tp:
                    categories_list.append(most_similar)
                    JobCategorizer.logger.debug(("Category %s fully resolved for "
                                  "job %s"), cat, job_url)
                else:
                    # Not found, likely a misspecified category config.
                    JobCategorizer.logger.warning(("Mismatch in hierarchical structures "
                                    "between database and category file "
                                    "for category %s, job %s"), cat, job_url)
            else:
                # Neither one nor two parts: not expected to happen.
                JobCategorizer.logger.error("Impossible category %s. Investigate.", cat)
        return categories_list
コード例 #2
0
ファイル: tree_parser.py プロジェクト: pmavrodiev/jobs
    def build_tree(self):
        """Populate the tree from the rows of the CSV description.

        Returns early, leaving ``self.initialized`` untouched, when the
        description read by ``__read_csv_description`` is empty or missing.
        Otherwise a ROOT node is added and each row is turned into a chain
        of nodes: the first entry hangs under the root and every following
        entry hangs under the entry before it. ``self.initialized`` is set
        to True once all rows have been processed.
        """
        rows = self.__read_csv_description()
        if not rows:
            return

        # Every chain starts below this node.
        self.add_node("ROOT", alternative_name="ROOT", identifier="root")

        for row in rows:
            previous_id = None
            # Each entry of a row becomes one tree node.
            for position, raw_node in enumerate(row):
                # Example entry: "Технологии (Четвъртичен сектор)" yields
                # (node_name, alternative_name) =
                # ("Технологии", "Четвъртичен сектор").
                node_name, alternative_name = self.__tokenize_nodename(raw_node)

                # The first entry of a row always has the root as parent.
                if position == 0:
                    parent = "root"
                else:
                    parent = previous_id

                # Below the root the bare name is used; deeper nodes are
                # prefixed with their parent's identifier.
                if parent == "root":
                    unique_id = node_name
                else:
                    unique_id = parent + node_name

                node_id = sanitize_id(unique_id)
                self.add_node(node_name, alternative_name,
                              identifier=node_id,
                              parent_identifier=parent)
                previous_id = node_id
        self.initialized = True