def mimic_dict(self):
    """
    Return the mimic dict mapping each word to the list of words that follow it.

    The text is read from the file named by ``self._filename``, cleaned with
    ``StringTool.clean_string`` and split on runs of spaces. Every word
    becomes a key; its value lists, in order of appearance and with
    repeats, each word that immediately follows it. A final word that never
    appears earlier in the text is still added as a key, with an empty list.

    The resulting dict is also printed to stdout before being returned.
    """
    # The context manager closes the file even if read() raises.
    with open(self._filename, 'r') as file_input:
        string_input = file_input.read()

    string_cleaned = StringTool().clean_string(string_input)
    # Splitting on spaces only leaves contraction apostrophes inside words.
    words_input = re.split(' +', string_cleaned)

    # Drop empty words (e.g. small.txt yields a trailing '' after the split).
    # Use truthiness rather than `is not ''`: identity comparison against a
    # string literal is not guaranteed to work and warns on Python 3.8+.
    words_cleaned = [item for item in words_input if item]

    output_dict = {}

    # Walk consecutive (word, follower) pairs; the last word has no follower
    # and is therefore not a first element of any pair.
    for current_word, next_word in zip(words_cleaned, words_cleaned[1:]):
        output_dict.setdefault(current_word, []).append(next_word)

    # Make sure the last word is present as a key even if nothing follows it.
    if words_cleaned:
        output_dict.setdefault(words_cleaned[-1], [])

    print('output_dict')
    print(output_dict)
    print()
    return output_dict
def count_words(filename):
    """Count how often each word occurs in a text file, ignoring case.

    Each line of the file is passed through ``StringTool.clean_string`` and
    split on whitespace; every resulting word is lowercased before counting.

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict mapping each lowercased word to its number of occurrences.
    """
    string_tool = StringTool()
    word_counts = {}
    # The context manager guarantees the file is closed even if cleaning or
    # reading a line raises part-way through.
    with open(filename, 'r') as words_file:
        for line in words_file:
            # split() with no arguments splits on any run of whitespace.
            for word in string_tool.clean_string(line).split():
                word_lower = word.lower()
                word_counts[word_lower] = word_counts.get(word_lower, 0) + 1

    return word_counts
Example #3
0
 def str_to_datetime(cls, s):
     """Convert a date string to a datetime; return None if conversion fails.

     Args:
         s: The value to convert. A datetime is returned unchanged, a date
             is converted with ``cls.date_to_datetime``, and anything else
             is passed through ``StringTool.s`` and then parsed.

     Returns:
         The resulting datetime, or None when the value is empty, is not a
         string after ``StringTool.s``, or cannot be parsed.
     """
     # Test datetime before date: assuming sys_datetime is the standard
     # datetime module, datetime subclasses date, so checking date first
     # would make this branch unreachable.
     if isinstance(s, sys_datetime.datetime):
         return s
     if isinstance(s, sys_datetime.date):
         return cls.date_to_datetime(s)
     # NOTE(review): StringTool.s presumably normalizes the value to a
     # string; confirm against its definition.
     s = StringTool.s(s)
     if not isinstance(s, str) or not s:
         return None
     try:
         return parser.parse(s)
     except Exception as e:
         # Deliberate best-effort: any parse failure is logged and reported
         # to the caller as None rather than raised.
         log.error('TimeTool str_to_datetime error %s, [%s]' % (e, s))
         return None