def run(corpus_file_object=None, keep_case=False, max_word_tokens=0):
    """Count word unigrams, bigrams and trigrams in a corpus, line by line.

    Each line is passed through ``fix_punctuations``, stripped, optionally
    case-folded, and split on whitespace. N-grams are built within a single
    line only; they never span a line boundary.

    :param corpus_file_object: iterable of text lines (e.g. an open file or
        a ``StringIO``). ``None`` is treated as an empty corpus.
    :param keep_case: if false (the default), every line is case-folded with
        ``str.casefold`` before tokenization.
    :param max_word_tokens: soft cap on the number of word tokens to read;
        ``0`` means no cap. The cap is checked before each line and reading
        stops only once the running total strictly exceeds it, so the final
        total can overshoot the cap.
    :return: a 3-tuple of plain dicts ``(unigrams, bigrams, trigrams)``
        mapping each n-gram to its count. Unigram keys are strings; bigram
        and trigram keys are tuples of strings.
    """
    unigrams_counter = Counter()
    bigrams_counter = Counter()
    trigrams_counter = Counter()

    # The default argument is None; iterating it would raise TypeError, so
    # handle "no corpus" explicitly and report empty counts.
    if corpus_file_object is None:
        return {}, {}, {}

    current_word_token_count = 0

    for line in corpus_file_object:
        if max_word_tokens and current_word_token_count > max_word_tokens:
            break

        line = fix_punctuations(line).strip()
        if not keep_case:
            line = line.casefold()

        words = line.split()
        if not words:
            continue

        current_word_token_count += len(words)

        unigrams_counter.update(words)
        # zip stops at the shortest slice, so lines shorter than n simply
        # contribute no n-grams.
        bigrams_counter.update(zip(words, words[1:]))
        trigrams_counter.update(zip(words, words[1:], words[2:]))

    return dict(unigrams_counter), dict(bigrams_counter), dict(trigrams_counter)
def run(corpus_file_object=None, keep_case=False, max_word_tokens=0):
    """Count word unigrams, bigrams and trigrams in a corpus, line by line.

    Each line is passed through ``fix_punctuations``, stripped, optionally
    lowercased, and split on whitespace. N-grams are built within a single
    line only; they never span a line boundary.

    :param corpus_file_object: iterable of text lines (e.g. an open file or
        a ``StringIO``). ``None`` is treated as an empty corpus.
    :param keep_case: if false (the default), every line is lowercased with
        ``str.lower`` before tokenization.
    :param max_word_tokens: soft cap on the number of word tokens to read;
        ``0`` means no cap. The cap is checked before each line and reading
        stops only once the running total strictly exceeds it, so the final
        total can overshoot the cap.
    :return: a 3-tuple of plain dicts ``(unigrams, bigrams, trigrams)``
        mapping each n-gram to its count. Unigram keys are strings; bigram
        and trigram keys are tuples of strings.
    """
    unigrams_counter = Counter()
    bigrams_counter = Counter()
    trigrams_counter = Counter()

    # The default argument is None; iterating it would raise TypeError, so
    # handle "no corpus" explicitly and report empty counts.
    if corpus_file_object is None:
        return {}, {}, {}

    current_word_token_count = 0

    for line in corpus_file_object:
        if max_word_tokens and current_word_token_count > max_word_tokens:
            break

        line = fix_punctuations(line).strip()
        if not keep_case:
            line = line.lower()

        words = line.split()
        if not words:
            continue

        current_word_token_count += len(words)

        unigrams_counter.update(words)
        # zip stops at the shortest slice, so lines shorter than n simply
        # contribute no n-grams.
        bigrams_counter.update(zip(words, words[1:]))
        trigrams_counter.update(zip(words, words[1:], words[2:]))

    return (dict(unigrams_counter),
            dict(bigrams_counter),
            dict(trigrams_counter))
def __init__(self, file_path=None, wordlist_file=False, corpus_object=None,
             wordlist_object=None, encoding=ENCODING, **kwargs):
    """Set up the lexicon from a file, an in-memory corpus, or a wordlist.

    :param file_path: path to a corpus or wordlist file; it is passed
        through ``self._check_file_path`` and the result is stored as
        ``file_abspath`` (``directory`` is its dirname, or ``None``).
    :param wordlist_file: if true, the file at ``file_path`` is opened as a
        wordlist rather than as a running-text corpus.
    :param corpus_object: in-memory corpus, either a list of strings
        (joined with single spaces) or one text string.
    :param wordlist_object: in-memory wordlist, either a dict mapping
        word -> count or any iterable of words (each word then counts 1).
    :param encoding: encoding used when opening ``file_path``.
    :param kwargs: forwarded to ``self._determine_parameters``; the result
        is stored as ``parameters_``.
    :raises TypeError: if ``wordlist_object`` is neither a dict nor an
        iterable, or if ``corpus_object`` is neither a list nor text.
    """
    self.file_abspath = self._check_file_path(file_path)

    if self.file_abspath is None:
        self.directory = None
    else:
        self.directory = os.path.dirname(self.file_abspath)

    self.file_is_wordlist = wordlist_file
    self.encoding = encoding
    self.corpus_object = corpus_object
    self.wordlist_object = wordlist_object
    self.parameters_ = self._determine_parameters(**kwargs)

    # number of word types and tokens
    self._number_of_word_types = None
    self._number_of_word_tokens = None

    # word ngrams
    self._word_unigram_counter = None
    self._word_bigram_counter = None
    self._word_trigram_counter = None

    # wordlist
    self._wordlist = None
    if self.wordlist_object is not None:
        # self.wordlist_object is
        # either an iterable or a dict of word-count pairs
        if isinstance(self.wordlist_object, dict):
            if self.parameters_['keep_case']:
                # the caller's mapping is used as-is (no copy is made)
                word_count_dict = self.wordlist_object
            else:
                # Merge the counts of words that differ only by case.
                # .items() is required here: iterating the dict itself
                # yields keys only, which cannot be unpacked into
                # (word, count).
                word_count_dict = dict()
                for word, count in self.wordlist_object.items():
                    word = word.lower()
                    word_count_dict[word] = \
                        word_count_dict.get(word, 0) + count

            self._wordlist = [
                word_ for word_, _ in
                double_sorted(word_count_dict.items(),
                              key=lambda x: x[1], reverse=True)
            ]
            self._word_unigram_counter = word_count_dict

        elif hasattr(self.wordlist_object, '__iter__'):
            if self.parameters_['keep_case']:
                self._wordlist = sorted(set(self.wordlist_object))
            else:
                self._wordlist = sorted(
                    set(w.lower() for w in self.wordlist_object))
            # no frequency information available: every word counts once
            self._word_unigram_counter = {w: 1 for w in self._wordlist}

        else:
            raise TypeError('wordlist object must be a dict of word-count '
                            'pairs or an iterable of words')

    # corpus file object
    if self.corpus_object is not None:
        # self.corpus_object is either a list of strings or a long str
        if isinstance(self.corpus_object, list):
            corpus_str = fix_punctuations(' '.join(self.corpus_object))
        elif isinstance(self.corpus_object, six.text_type):
            corpus_str = fix_punctuations(self.corpus_object)
        else:
            raise TypeError('corpus object must be either a text or list')
        # wrap the text so it can be read like a file
        self.corpus_file_object = StringIO(corpus_str)
    elif self.file_abspath and not self.file_is_wordlist:
        self.corpus_file_object = open(self.file_abspath,
                                       encoding=self.encoding)
    else:
        self.corpus_file_object = None

    # wordlist file object
    if self.file_is_wordlist:
        self.wordlist_file_object = open(self.file_abspath,
                                         encoding=self.encoding)
    else:
        self.wordlist_file_object = StringIO()

    # manifold-related objects
    self._words_to_neighbors = None
    self._words_to_contexts = None
    self._contexts_to_words = None
    self._neighbor_graph = None

    # phon objects
    self._phone_unigram_counter = None
    self._phone_bigram_counter = None
    self._phone_trigram_counter = None

    self._phone_dict = None
    self._biphone_dict = None
    self._word_dict = None
    self._words_to_phones = None

    # trie objects
    self._broken_words_left_to_right = None
    self._broken_words_right_to_left = None
    self._successors = None
    self._predecessors = None

    # Parent initialisation comes last because it consumes self.wordlist(),
    # which is called only after the attributes above have been set.
    Lexicon_BiSig.__init__(self, self.wordlist(),
                           self.parameters_['min_stem_length'],
                           self.parameters_['max_affix_length'],
                           self.parameters_['min_sig_count'],
                           self.parameters_['suffixing'])
def _initialize(self): # number of word types and tokens self._number_of_word_types = None self._number_of_word_tokens = None # word ngrams self._word_unigram_counter = None self._word_bigram_counter = None self._word_trigram_counter = None # wordlist self._wordlist = None if self.wordlist_object is not None: # self.wordlist_object is # either an iterable or a dict of word-count pairs if type(self.wordlist_object) is dict: word_count_dict = dict() if self.parameters_['keep_case']: word_count_dict = self.wordlist_object else: for word, count in self.wordlist_object: word = word.lower() if word not in word_count_dict: word_count_dict[word] = 0 word_count_dict[word] += count self._wordlist = [word for word, _ in double_sorted(word_count_dict.items(), key=lambda x: x[1], reverse=True)] self._word_unigram_counter = word_count_dict elif hasattr(self.wordlist_object, '__iter__'): if self.parameters_['keep_case']: self._wordlist = sorted(set(self.wordlist_object)) else: self._wordlist = sorted( set(w.lower() for w in self.wordlist_object)) self._word_unigram_counter = {w: 1 for w in self._wordlist} else: raise TypeError('wordlist object must be a dict of word-count' 'pairs or an iterable of words') # signature-related objects self._stems_to_words = None self._signatures_to_stems = None self._stems_to_signatures = None self._words_to_signatures = None self._signatures_to_words = None self._words_to_sigtransforms = None self._signatures = None self._affixes_to_signatures = None self._words_in_signatures = None self._affixes = None self._stems = None # corpus file object if self.corpus_object is not None: # self.corpus_object is either a list of strings or a long str if type(self.corpus_object) is list: corpus_str = fix_punctuations(' '.join(self.corpus_object)) elif type(self.corpus_object) is str: corpus_str = fix_punctuations(self.corpus_object) else: raise TypeError('corpus object must be either a str or a list') self.corpus_file_object = StringIO(corpus_str) elif 
self.file_abspath and not self.file_is_wordlist: self.corpus_file_object = open(self.file_abspath, encoding=self.encoding) else: self.corpus_file_object = None # wordlist file object if self.file_is_wordlist: self.wordlist_file_object = open(self.file_abspath, encoding=self.encoding) else: self.wordlist_file_object = StringIO() # manifold-related objects self._words_to_neighbors = None self._words_to_contexts = None self._contexts_to_words = None self._neighbor_graph = None # phon objects self._phone_unigram_counter = None self._phone_bigram_counter = None self._phone_trigram_counter = None self._phone_dict = None self._biphone_dict = None self._word_dict = None self._words_to_phones = None # trie objects self._broken_words_left_to_right = None self._broken_words_right_to_left = None self._successors = None self._predecessors = None