Exemple #1
0
def _obtain_candidate_keywords(list_all_dialogs, candi_kw_path, min_kw_freq=1, load_file_if_exists=True):
    r"""Obtain and save the candidate keywords used for extracting keywords.

    If ``load_file_if_exists`` is true and ``candi_kw_path`` is an existing
    file, the keywords are read back from that file (one per line, stripped)
    and returned as-is; ``list_all_dialogs`` and ``min_kw_freq`` are not
    consulted in that case. Otherwise the keywords are extracted from the
    dialogs, filtered, written to ``candi_kw_path`` and returned.

    Inputs: list_all_dialogs, candi_kw_path, min_kw_freq, load_file_if_exists
        - **list_all_dialogs**: an iterable of dialogs, each dialog being an
          iterable of utterances; every utterance is passed to
          ``KeywordExtractor.candi_extract``.
        - **candi_kw_path**: path of the text file holding one candidate
          keyword per line. Its parent directory is created if missing.
        - **min_kw_freq**: keywords extracted fewer than this many times
          over all utterances are dropped.
        - **load_file_if_exists**: a 'bool' indicating whether to load the
          keywords from ``candi_kw_path`` when that file already exists.

    Outputs: candi_keywords
        - **candi_keywords**:  a 'list' containing all the candidate keywords.
          When freshly extracted, it is ordered from most to least frequent.

    Raises:
        ValueError: if the keywords have to be extracted but
            ``list_all_dialogs`` is empty.
    """
    if load_file_if_exists and os.path.isfile(candi_kw_path):
        with open(candi_kw_path, 'r') as f:
            candi_keywords = [kw.strip() for kw in f]
        print('Loading candidate keywords from {}'.format(candi_kw_path))
        print('Total candidate keywords count: ', len(candi_keywords))
        return candi_keywords

    if not list_all_dialogs:
        raise ValueError('no dialogs provided for obtaining candidate keywords')

    # A bare filename has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    candi_kw_dir = os.path.dirname(candi_kw_path)
    if candi_kw_dir:
        os.makedirs(candi_kw_dir, exist_ok=True)

    print('Obtaining candidate keywords...')

    # Count how often each keyword is extracted over all utterances.
    # NOTE(review): candi_extract presumably returns a list of keyword
    # strings -- confirm against KeywordExtractor.
    kw_counter = collections.Counter()
    kw_extractor = KeywordExtractor()
    for dialog in tqdm(list_all_dialogs):
        for utterance in dialog:
            kw_counter.update(kw_extractor.candi_extract(utterance))

    # Single pass over the frequency-sorted keywords: a keyword is either
    # too rare, or frequent enough but a single character, or kept.
    rare_keywords_count = 0
    single_letter_keywords_count = 0
    candi_keywords = []
    for kw, freq in kw_counter.most_common():
        if freq < min_kw_freq:
            rare_keywords_count += 1
        elif len(kw) < 2:
            single_letter_keywords_count += 1
        else:
            candi_keywords.append(kw)

    # print the information of candidate keywords
    print('rare keywords count: ', rare_keywords_count)
    print('single letter keywords count: ', single_letter_keywords_count)
    print('total candidate keywords count(before cleaning): ', len(kw_counter))
    print('total candidate keywords count(after cleaning):  ', len(candi_keywords))

    print('Saving candi_keywords into {}...'.format(candi_kw_path))
    with open(candi_kw_path, 'w') as f:
        f.writelines(keyword + '\n' for keyword in candi_keywords)

    return candi_keywords
    def _obtain_candidate_keywords(self, load_file_if_exists=True):
        r"""Obtains and saves the candidate keywords used for extracting keywords.

        The keywords are cached in '../data/<self.output_data_dir>/candi_keywords.txt'
        (one keyword per line, path relative to the current working directory).
        When the cache is not used, keywords are extracted from every utterance
        of 'self.list_all_dialogs', keywords seen fewer than 'self.min_kw_freq'
        times or shorter than two characters are dropped, and the remaining
        ones are written to the cache file, most frequent first.

        Args:
            load_file_if_exists: A 'bool' indicating whether load candi_keywords file if it exists.

        Returns:
            candi_keywords: A 'list' containing all the candidate keywords.
        """
        # Computed once so the load and save paths cannot drift apart.
        candi_keywords_path = '../data/{}/candi_keywords.txt'.format(
            self.output_data_dir)

        if load_file_if_exists and os.path.isfile(candi_keywords_path):
            with open(candi_keywords_path, 'r') as f:
                candi_keywords = [kw.strip() for kw in f]
            print('Loading candidate keywords from {}'.format(candi_keywords_path))
            print('Total candidate keywords count: ', len(candi_keywords))
            return candi_keywords

        print('Obtaining candidate keywords...')

        # Counts how often each keyword is extracted over all utterances.
        # NOTE(review): candi_extract presumably returns a list of keyword
        # strings -- confirm against KeywordExtractor.
        kw_counter = collections.Counter()
        kw_extractor = KeywordExtractor()
        for dialog in tqdm(self.list_all_dialogs):
            for utterance in dialog:
                kw_counter.update(kw_extractor.candi_extract(utterance))

        # Keeps keywords that are frequent enough and longer than one
        # character, preserving the most-frequent-first order.
        candi_keywords = [
            kw for kw, freq in kw_counter.most_common()
            if freq >= self.min_kw_freq and len(kw) >= 2
        ]

        # Makes sure the output directory exists before writing; opening the
        # file for writing fails if its parent directory is missing.
        os.makedirs(os.path.dirname(candi_keywords_path), exist_ok=True)
        with open(candi_keywords_path, 'w') as f:
            f.writelines(keyword + '\n' for keyword in candi_keywords)

        return candi_keywords