Ejemplo n.º 1
0
 def __init__(self, name):
     """Load the entries stored under ``name`` and build the model on them.

     The object returned by ``EntriesDataObject.load(name)`` is kept as
     ``self.entries``; its ``data`` attribute is handed to ``TVM`` along
     with ``TVMUser.comparator`` and ``TVMUser.combinator``.
     ``self.results`` starts out as an empty dict.
     """
     self.entries = EntriesDataObject.load(name)
     entry_data = self.entries.data
     self.model = TVM(
         entry_data,
         TVMUser.comparator,
         TVMUser.combinator,
     )
     self.results = dict()
Ejemplo n.º 2
0
 def __init__(self, name):
     """Set up the instance from the entries saved under ``name``.

     Stores the loaded entries object on ``self.entries``, builds
     ``self.model`` as a ``TVM`` over ``self.entries.data`` using the
     class's ``comparator`` and ``combinator``, and initialises
     ``self.results`` to an empty dict.
     """
     self.entries = EntriesDataObject.load(name)
     source = self.entries.data
     order_key = TVMUser.comparator
     merge = TVMUser.combinator
     self.model = TVM(source, order_key, merge)
     self.results = dict()
Ejemplo n.º 3
0
class TVMUser:
    """Text statistics over one user's entries, grouped by time period.

    Builds a ``TVM`` model from the entries loaded for a name and memoizes
    every per-period computation in ``self.results``.

    NOTE(review): ``TVM``, ``Period``, ``Entry`` and ``EntriesDataObject``
    are defined outside this class.  The methods below assume that
    ``TVM.apply(period, function)`` returns an indexable sequence of
    ``function`` results -- confirm against ``TVM``.
    """

    # Reference instant that ``comparator`` measures entry dates against.
    _EPOCH = datetime(2000, 1, 1)

    @staticmethod
    def comparator(entry):
        """Return the whole seconds between ``entry.date`` and ``_EPOCH``.

        Only the ``days`` and ``seconds`` parts of the difference are used,
        so microseconds are ignored.
        """
        diff = entry.date - TVMUser._EPOCH
        return diff.days * 24 * 60 * 60 + diff.seconds

    @staticmethod
    def combinator(entry1, entry2):
        """Merge two entries: id and date of ``entry1``, data concatenated."""
        return Entry(entry1.id, entry1.date, entry1.data + entry2.data)

    # One-unit periods; presumably meant to be passed as ``period`` below.
    year = Period(Period.YEAR_TYPE, 1)
    month = Period(Period.MONTH_TYPE, 1)
    day = Period(Period.DAY_TYPE, 1)
    minute = Period(Period.MINUTE_TYPE, 1)

    # Words that ``print_freq`` never lists.
    insig_words = (
        "i", "to", "and", "the", "you", "of", "this", "is", "it", "t", "my",
        "on", "for", "as", "ll", "s", "a", "that", "in", "be", "not", "was",
        "with", "me", "have", "are", "your", "or", "can", "when", "our",
        "all", "but", "how", "did", "which", "what", "there", "just", "its",
        "has", "been", "from", "one", "her", "she", "were", "had", "more",
        "about", "they", "their", "his",
    )

    def __init__(self, name):
        """Load the entries stored under ``name`` and build the model."""
        self.entries = EntriesDataObject.load(name)
        self.model = TVM(self.entries.data, TVMUser.comparator,
                         TVMUser.combinator)
        # Memo of computed results, keyed by (period, computation name).
        self.results = {}

    def _apply(self, period, name, function):
        """Return ``self.model.apply(period, function)``, computed once.

        The result is cached under ``(period, name)``, so ``name`` must
        identify ``function`` uniquely.
        """
        key = (period, name)
        if key not in self.results:
            self.results[key] = self.model.apply(period, function)
        return self.results[key]

    def get_length(self, period):
        """Apply ``len(entry.data)`` over ``period`` (memoized)."""
        def length(entry):
            return len(entry.data)
        return self._apply(period, "length", length)

    def get_freq(self, period):
        """Apply a relative word-frequency computation over ``period``.

        Each entry is turned into ``(entry.date, freqs, len(entry.data))``
        where ``freqs`` is a list of ``(lowercased word, share of all
        words)`` pairs sorted by descending share.
        """
        def freq(entry):
            # re.split yields empty strings when the text starts or ends
            # with a non-word character; they are not words, so drop them
            # (get_markov_sentences filters them the same way).
            tokens = [token.lower()
                      for token in split(r'\W+', entry.data) if token]
            counts = {}
            for token in tokens:
                counts[token] = counts.get(token, 0) + 1
            total = len(tokens)
            freqs = [(word, float(count) / total)
                     for word, count in counts.items()]
            ranked = sorted(freqs, key=lambda pair: pair[1])[::-1]
            return entry.date, ranked, len(entry.data)
        return self._apply(period, "freq", freq)

    def print_freq(self, period, i):
        """Print the top significant words of the selected frequency results.

        For every ``(date, freqs, length)`` item in
        ``self.get_freq(period)[i]`` one line is printed in the form
        ``"<year> <month> | <length>: word, word, "`` listing at most ten
        words that are longer than two characters and not in
        ``insig_words``.

        NOTE(review): the selected value is iterated as a sequence of
        result tuples, so ``i`` presumably has to be a slice (or
        ``TVM.apply`` returns nested sequences) -- confirm with callers.
        """
        selected = self.get_freq(period)[i]

        def include(word):
            return len(word) > 2 and word.lower() not in TVMUser.insig_words

        for item in selected:
            date, freqs, length = item[0], item[1], item[2]
            shown = []
            for pair in freqs:
                if include(pair[0]):
                    shown.append(pair[0])
                    if len(shown) >= 10:
                        break
            header = (str(date.year) + " " + str(date.month) + " | "
                      + str(length) + ": ")
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(header + "".join(word + ", " for word in shown))

    @staticmethod
    def splitParagraphIntoSentences(paragraph):
        """Split ``paragraph`` into sentences and return them as a list.

        The split happens on whitespace that follows ``.``, ``!`` or ``?``
        (optionally followed by a quote), except directly after the
        abbreviations listed in the pattern (matched case-insensitively).
        """
        sentenceEnders = compile(r"""
           # Split sentences on whitespace between them.
           (?:               # Group for two positive lookbehinds.
             (?<=[.!?])      # Either an end of sentence punct,
           | (?<=[.!?]['"])  # or end of sentence punct and quote.
           )                 # End group of two positive lookbehinds.
           (?<!  Mr\.   )    # Don't end sentence on "Mr."
           (?<!  Mrs\.  )
           (?<!  Ms\.   )
           (?<!  Jr\.   )
           (?<!  Dr\.   )
           (?<!  Prof\. )
           (?<!  Sr\.   )
           \s+               # Split on whitespace between sentences.
           """,
            IGNORECASE | VERBOSE)
        return sentenceEnders.split(paragraph)

    def get_markov_sentences(self, period):
        """Apply a word-transition count over ``period`` (memoized).

        Each entry is turned into a dict mapping a lowercased word to a
        dict of ``{following word: number of times it followed}``.
        Transitions never cross sentence boundaries.
        """
        def markov_sentences(entry):
            transitions = {}
            for sentence in TVMUser.splitParagraphIntoSentences(entry.data):
                words = [word.lower()
                         for word in split(r"\W+", sentence) if word]
                for cur, nxt in zip(words, words[1:]):
                    followers = transitions.setdefault(cur, {})
                    followers[nxt] = followers.get(nxt, 0) + 1
            return transitions
        return self._apply(period, "markov_sentences", markov_sentences)

    def make_markov_sentences(self, period, i, size=10, seed=None):
        """Generate a random word sequence from the ``i``-th transition table.

        Starts at ``seed`` (a random key of the table when ``seed`` is
        None) and repeatedly picks a follower with probability proportional
        to its count, until ``size`` words were appended or the current
        word has no recorded follower.  Returns the words joined by single
        spaces; returns ``""`` when no seed is given and the table is
        empty.
        """
        chain = self.get_markov_sentences(period)[i]

        if seed is None:
            if not chain:
                # Nothing to start from: an empty table has no words.
                return ""
            # list(chain) instead of chain.keys()[...] so that indexing
            # also works where keys() is a view (Python 3).
            seed = list(chain)[int(random() * len(chain))]

        def choose_word(word):
            followers = chain[word]
            total = sum(followers.values())
            selection = random()

            bound = 0
            chosen = None
            for chosen, count in followers.items():
                bound += float(count) / total
                if bound >= selection:
                    break
            # If rounding left the accumulated bound just below
            # ``selection``, the loop ends normally and the last follower
            # is used instead of returning None.
            return chosen

        cur_word = seed
        sentence = cur_word
        count = 0
        while cur_word in chain and count < size:
            cur_word = choose_word(cur_word)
            sentence += " " + cur_word
            count += 1

        return sentence

    def make_freq_vectors(self, period):
        """Turn the frequency results of ``period`` into aligned vectors.

        Returns ``(dates, freq_vectors, word_space_map_inverse)``:
        ``dates[k]`` and ``freq_vectors[k]`` belong to the k-th result of
        ``get_freq``; position ``j`` of every vector holds the frequency of
        the word ``word_space_map_inverse[j]`` (0 when absent).  Every
        vector has exactly one position per distinct word.
        """
        results = self.get_freq(period)

        word_space_map = {}
        word_space_map_inverse = {}
        for result in results:
            for word, _ in result[1]:
                word = word.lower()
                if word not in word_space_map:
                    index = len(word_space_map)
                    word_space_map[word] = index
                    word_space_map_inverse[index] = word
        # One dimension per distinct word (no unmapped trailing column).
        n = len(word_space_map)

        dates = []
        freq_vectors = []
        for result in results:
            dates.append(result[0])

            vec = [0] * n
            for word, freq in result[1]:
                vec[word_space_map[word.lower()]] = freq

            freq_vectors.append(vec)

        return dates, freq_vectors, word_space_map_inverse
Ejemplo n.º 4
0
class TVMUser:
    """Text statistics over one user's entries, grouped by time period.

    Builds a ``TVM`` model from the entries loaded for a name and memoizes
    every per-period computation in ``self.results``.

    NOTE(review): ``TVM``, ``Period``, ``Entry`` and ``EntriesDataObject``
    are defined outside this class.  The methods below assume that
    ``TVM.apply(period, function)`` returns an indexable sequence of
    ``function`` results -- confirm against ``TVM``.
    """

    # Reference instant that ``comparator`` measures entry dates against.
    _EPOCH = datetime(2000, 1, 1)

    @staticmethod
    def comparator(entry):
        """Return the whole seconds between ``entry.date`` and ``_EPOCH``.

        Only the ``days`` and ``seconds`` parts of the difference are used,
        so microseconds are ignored.
        """
        diff = entry.date - TVMUser._EPOCH
        return diff.days * 24 * 60 * 60 + diff.seconds

    @staticmethod
    def combinator(entry1, entry2):
        """Merge two entries: id and date of ``entry1``, data concatenated."""
        return Entry(entry1.id, entry1.date, entry1.data + entry2.data)

    # One-unit periods; presumably meant to be passed as ``period`` below.
    year = Period(Period.YEAR_TYPE, 1)
    month = Period(Period.MONTH_TYPE, 1)
    day = Period(Period.DAY_TYPE, 1)
    minute = Period(Period.MINUTE_TYPE, 1)

    # Words that ``print_freq`` never lists.
    insig_words = (
        "i", "to", "and", "the", "you", "of", "this", "is", "it", "t", "my",
        "on", "for", "as", "ll", "s", "a", "that", "in", "be", "not", "was",
        "with", "me", "have", "are", "your", "or", "can", "when", "our",
        "all", "but", "how", "did", "which", "what", "there", "just", "its",
        "has", "been", "from", "one", "her", "she", "were", "had", "more",
        "about", "they", "their", "his",
    )

    def __init__(self, name):
        """Load the entries stored under ``name`` and build the model."""
        self.entries = EntriesDataObject.load(name)
        self.model = TVM(self.entries.data, TVMUser.comparator,
                         TVMUser.combinator)
        # Memo of computed results, keyed by (period, computation name).
        self.results = {}

    def _apply(self, period, name, function):
        """Return ``self.model.apply(period, function)``, computed once.

        The result is cached under ``(period, name)``, so ``name`` must
        identify ``function`` uniquely.
        """
        key = (period, name)
        if key not in self.results:
            self.results[key] = self.model.apply(period, function)
        return self.results[key]

    def get_length(self, period):
        """Apply ``len(entry.data)`` over ``period`` (memoized)."""
        def length(entry):
            return len(entry.data)

        return self._apply(period, "length", length)

    def get_freq(self, period):
        """Apply a relative word-frequency computation over ``period``.

        Each entry is turned into ``(entry.date, freqs, len(entry.data))``
        where ``freqs`` is a list of ``(lowercased word, share of all
        words)`` pairs sorted by descending share.
        """
        def freq(entry):
            # re.split yields empty strings when the text starts or ends
            # with a non-word character; they are not words, so drop them
            # (get_markov_sentences filters them the same way).
            tokens = [token.lower()
                      for token in split(r'\W+', entry.data) if token]
            counts = {}
            for token in tokens:
                counts[token] = counts.get(token, 0) + 1
            total = len(tokens)
            freqs = [(word, float(count) / total)
                     for word, count in counts.items()]
            ranked = sorted(freqs, key=lambda pair: pair[1])[::-1]
            return entry.date, ranked, len(entry.data)

        return self._apply(period, "freq", freq)

    def print_freq(self, period, i):
        """Print the top significant words of the selected frequency results.

        For every ``(date, freqs, length)`` item in
        ``self.get_freq(period)[i]`` one line is printed in the form
        ``"<year> <month> | <length>: word, word, "`` listing at most ten
        words that are longer than two characters and not in
        ``insig_words``.

        NOTE(review): the selected value is iterated as a sequence of
        result tuples, so ``i`` presumably has to be a slice (or
        ``TVM.apply`` returns nested sequences) -- confirm with callers.
        """
        selected = self.get_freq(period)[i]

        def include(word):
            return len(word) > 2 and word.lower() not in TVMUser.insig_words

        for item in selected:
            date, freqs, length = item[0], item[1], item[2]
            shown = []
            for pair in freqs:
                if include(pair[0]):
                    shown.append(pair[0])
                    if len(shown) >= 10:
                        break
            header = (str(date.year) + " " + str(date.month) + " | "
                      + str(length) + ": ")
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(header + "".join(word + ", " for word in shown))

    @staticmethod
    def splitParagraphIntoSentences(paragraph):
        """Split ``paragraph`` into sentences and return them as a list.

        The split happens on whitespace that follows ``.``, ``!`` or ``?``
        (optionally followed by a quote), except directly after the
        abbreviations listed in the pattern (matched case-insensitively).
        """
        sentenceEnders = compile(
            r"""
           # Split sentences on whitespace between them.
           (?:               # Group for two positive lookbehinds.
             (?<=[.!?])      # Either an end of sentence punct,
           | (?<=[.!?]['"])  # or end of sentence punct and quote.
           )                 # End group of two positive lookbehinds.
           (?<!  Mr\.   )    # Don't end sentence on "Mr."
           (?<!  Mrs\.  )
           (?<!  Ms\.   )
           (?<!  Jr\.   )
           (?<!  Dr\.   )
           (?<!  Prof\. )
           (?<!  Sr\.   )
           \s+               # Split on whitespace between sentences.
           """, IGNORECASE | VERBOSE)
        return sentenceEnders.split(paragraph)

    def get_markov_sentences(self, period):
        """Apply a word-transition count over ``period`` (memoized).

        Each entry is turned into a dict mapping a lowercased word to a
        dict of ``{following word: number of times it followed}``.
        Transitions never cross sentence boundaries.
        """
        def markov_sentences(entry):
            transitions = {}
            for sentence in TVMUser.splitParagraphIntoSentences(entry.data):
                words = [word.lower()
                         for word in split(r"\W+", sentence) if word]
                for cur, nxt in zip(words, words[1:]):
                    followers = transitions.setdefault(cur, {})
                    followers[nxt] = followers.get(nxt, 0) + 1
            return transitions

        return self._apply(period, "markov_sentences", markov_sentences)

    def make_markov_sentences(self, period, i, size=10, seed=None):
        """Generate a random word sequence from the ``i``-th transition table.

        Starts at ``seed`` (a random key of the table when ``seed`` is
        None) and repeatedly picks a follower with probability proportional
        to its count, until ``size`` words were appended or the current
        word has no recorded follower.  Returns the words joined by single
        spaces; returns ``""`` when no seed is given and the table is
        empty.
        """
        chain = self.get_markov_sentences(period)[i]

        if seed is None:
            if not chain:
                # Nothing to start from: an empty table has no words.
                return ""
            # list(chain) instead of chain.keys()[...] so that indexing
            # also works where keys() is a view (Python 3).
            seed = list(chain)[int(random() * len(chain))]

        def choose_word(word):
            followers = chain[word]
            total = sum(followers.values())
            selection = random()

            bound = 0
            chosen = None
            for chosen, count in followers.items():
                bound += float(count) / total
                if bound >= selection:
                    break
            # If rounding left the accumulated bound just below
            # ``selection``, the loop ends normally and the last follower
            # is used instead of returning None.
            return chosen

        cur_word = seed
        sentence = cur_word
        count = 0
        while cur_word in chain and count < size:
            cur_word = choose_word(cur_word)
            sentence += " " + cur_word
            count += 1

        return sentence

    def make_freq_vectors(self, period):
        """Turn the frequency results of ``period`` into aligned vectors.

        Returns ``(dates, freq_vectors, word_space_map_inverse)``:
        ``dates[k]`` and ``freq_vectors[k]`` belong to the k-th result of
        ``get_freq``; position ``j`` of every vector holds the frequency of
        the word ``word_space_map_inverse[j]`` (0 when absent).  Every
        vector has exactly one position per distinct word.
        """
        results = self.get_freq(period)

        word_space_map = {}
        word_space_map_inverse = {}
        for result in results:
            for word, _ in result[1]:
                word = word.lower()
                if word not in word_space_map:
                    index = len(word_space_map)
                    word_space_map[word] = index
                    word_space_map_inverse[index] = word
        # One dimension per distinct word (no unmapped trailing column).
        n = len(word_space_map)

        dates = []
        freq_vectors = []
        for result in results:
            dates.append(result[0])

            vec = [0] * n
            for word, freq in result[1]:
                vec[word_space_map[word.lower()]] = freq

            freq_vectors.append(vec)

        return dates, freq_vectors, word_space_map_inverse