def find_abbreviations():
    """Print the 100 highest-scoring abbreviation candidates in the corpus.

    The text of every article returned by ``corpus.articles.find()`` is
    joined and tokenized with ``ignore_abbreviations=True``.  An occurrence
    of a token counts as "abbreviation-like" when the token right after it
    is ``'.'``.  Every token seen at least once before a period is scored::

        n_before_period * (1 + (f - f_avg)) / n_elsewhere / len(token)

    where ``f`` is the token's relative frequency, ``f_avg`` is the number
    of distinct tokens divided by the total number of tokens, and
    ``n_elsewhere`` (occurrences not followed by a period) is taken as 1
    when the token never occurs elsewhere.

    Each output line has the form
    ``token. score (n_before_period, n_elsewhere)``.  Nothing is printed
    when the corpus yields no tokens.
    """
    import db
    from tokenizers import es
    from nltk import FreqDist

    corpus = db.connect()
    text = '\n'.join([a['text'] for a in corpus.articles.find()])
    tokens = es.tokenize(text, ignore_abbreviations=True)

    fd = FreqDist()         # every token
    fd_abbr = FreqDist()    # occurrences followed by a period
    fd_n_abbr = FreqDist()  # all other occurrences
    last = len(tokens) - 1
    for i, token in enumerate(tokens):
        # ``fd[x] += 1`` replaces the removed ``FreqDist.inc(x)``.
        fd[token] += 1
        if i < last and tokens[i + 1] == u'.':
            fd_abbr[token] += 1
        else:
            fd_n_abbr[token] += 1

    # Float total: forces true division on Python 2 as well, where the
    # original integer divisions truncated every frequency to 0.
    total = float(fd.N())
    if not total:
        return

    f_avg = len(fd) / total
    adjusted = {}
    for t, n in fd_abbr.items():
        f = fd.get(t, 0) / total
        deviation = 1 + (f - f_avg)
        adjusted[t] = n * deviation / fd_n_abbr.get(t, 1) / len(t)

    ranked = sorted(adjusted.items(), key=lambda item: item[1], reverse=True)
    for t, score in ranked[:100]:
        print(u'%s. %f (%d, %d)' % (t, score, fd_abbr[t],
                                    fd_n_abbr.get(t, 0)))
def work_1():
    """Report term frequencies of ``trabalho1.txt`` on stdout, as CSV and as PDF.

    Reads ``trabalho1.txt``, tokenizes it with ``word_tokenize`` and counts
    the tokens.  The text, the total number of tokens and the number of
    distinct terms are printed, followed by one line per term.  The same
    term/count pairs are written to ``trabalho1.csv`` (``;``-delimited), and
    a ``PdfOutput`` holding the same data is handed to
    ``ServiceManagerPdf.writePdf``.
    """
    # Context managers guarantee both files are closed even if tokenizing,
    # printing or PDF generation raises.
    with open("trabalho1.txt", "r") as txt_file:
        base_text = txt_file.read()

    frequency = FreqDist(word_tokenize(base_text))
    total_words = frequency.N()
    total_terms = len(frequency.keys())

    print("texto : {0}".format(base_text))
    print("Total de palavras : {0}".format(total_words))
    print("Total de Termos : {0}".format(total_terms))
    print("")
    print("Tabela de Frequência de Termos")
    print("")

    # newline="" lets the csv module control line endings itself (otherwise
    # rows are separated by blank lines on Windows).
    with open("trabalho1.csv", "w", newline="") as csv_file:
        csv_manage = csv.writer(csv_file, delimiter=";",
                                quoting=csv.QUOTE_MINIMAL)
        for key in frequency.keys():
            count = str(frequency.get(key))
            csv_manage.writerow([key, count])
            print("Termo: {0} Total: {1}".format(key, count))

    pdfOutput = PdfOutput(frequency, total_words, total_terms, base_text)
    servicePdfManager = ServiceManagerPdf()
    servicePdfManager.writePdf(pdfOutput)
def calculate_nb_probabilities():
    """Compute unsmoothed Naive Bayes log-probabilities for both classes.

    For every word in the module-level ``poswords`` / ``negwords`` token
    lists the value stored is::

        log(count(word) / total number of tokens in that class)

    Returns:
        ``(poswordprobs, negwordprobs)``: two dicts mapping each word of the
        class to its log-probability.
    """
    def _log_probs(words):
        # Log relative frequency of every distinct word in ``words``.
        counts = FreqDist(words)
        # N() is the number of tokens.  The previous code divided by
        # len(FreqDist), i.e. the number of *types*, which is not the
        # P(word|class) = count / tokens formula and can even yield
        # "probabilities" above 1.  float() keeps true division on Python 2.
        total_tokens = float(counts.N())
        probs = {}
        for word in counts:
            probs[word] = math.log(counts.get(word) / total_tokens)
        return probs

    poswordprobs = _log_probs(poswords)
    negwordprobs = _log_probs(negwords)
    return (poswordprobs, negwordprobs)
def featureset(sample):
    """Build the feature dictionary for one labelled comment.

    Args:
        sample: a ``(comment, label)`` pair.  ``comment`` is a sequence of
            statements, each statement a sequence of ``(word, tag)`` pairs.

    Returns:
        ``(features, label)`` where ``features`` contains:

        * ``'stmt_len'``: average number of words per statement,
        * ``'word_len'``: average word length,
        * ``'size'``: total number of characters over all words,
        * one entry per word in ``EN_STOPWORDS``: its relative frequency
          among the lower-cased words,
        * ``'alwayson'``: constant ``1.0``,
        * ``'w_sim_<i>_<lang>'``, ``'t_sim_<i>_<lang>'``,
          ``'c_sim_<i>_<lang>'``: the first three values returned by
          ``comment_similarity`` for every language in ``LANGUAGES`` and
          every ``i`` in ``1..n``.

        Ratios are ``0.0`` when the comment has no statements / no words.
    """
    comment, label = sample

    # Flatten to a single word list.  A comprehension replaces the
    # tuple-parameter lambdas (a syntax error on Python 3) and the quadratic
    # sum(list_of_lists, []).
    words = [w for statement in comment for (w, _tag) in statement]
    n_words = len(words)
    n_statements = len(comment)
    size_ = sum(len(word) for word in words)

    features = {}
    features['stmt_len'] = n_words / float(n_statements) if n_statements else 0.0
    features['word_len'] = size_ / float(n_words) if n_words else 0.0
    features['size'] = size_

    dist = FreqDist([word.lower() for word in words])
    for word in EN_STOPWORDS:
        features[word] = dist.get(word, 0) / float(n_words) if n_words else 0.0

    features['alwayson'] = 1.0

    for language in LANGUAGES:
        for i in range(1, n + 1):
            # The fourth returned value is currently not used as a feature.
            word_sim, tag_sim, char_sim, _w_s_sim = comment_similarity(
                GRAMS[language], comment, i)
            features['w_sim_%d_%s' % (i, language)] = word_sim
            features['t_sim_%d_%s' % (i, language)] = tag_sim
            features['c_sim_%d_%s' % (i, language)] = char_sim
    return (features, label)
def featureset(sample):
    """Build the feature dictionary for one labelled comment.

    Args:
        sample: a ``(comment, label)`` pair.  ``comment`` is a sequence of
            statements, each statement a sequence of ``(word, tag)`` pairs.

    Returns:
        ``(features, label)`` where ``features`` contains:

        * ``'stmt_len'``: average number of words per statement,
        * ``'word_len'``: average word length,
        * ``'size'``: total number of characters over all words,
        * one entry per word in ``EN_STOPWORDS``: its relative frequency
          among the lower-cased words,
        * ``'alwayson'``: constant ``1.0``,
        * ``'w_sim_<i>_<lang>'``, ``'t_sim_<i>_<lang>'``,
          ``'c_sim_<i>_<lang>'``: the first three values returned by
          ``comment_similarity`` for every language in ``LANGUAGES`` and
          every ``i`` in ``1..n``.

        Ratios are ``0.0`` when the comment has no statements / no words.
    """
    comment, label = sample

    # Flatten to a single word list.  A comprehension replaces the
    # tuple-parameter lambdas (a syntax error on Python 3) and the quadratic
    # sum(list_of_lists, []).
    words = [w for statement in comment for (w, _tag) in statement]
    n_words = len(words)
    n_statements = len(comment)
    size_ = sum(len(word) for word in words)

    features = {}
    features['stmt_len'] = n_words / float(n_statements) if n_statements else 0.0
    features['word_len'] = size_ / float(n_words) if n_words else 0.0
    features['size'] = size_

    dist = FreqDist([word.lower() for word in words])
    for word in EN_STOPWORDS:
        features[word] = dist.get(word, 0) / float(n_words) if n_words else 0.0

    features['alwayson'] = 1.0

    for language in LANGUAGES:
        for i in range(1, n + 1):
            # The fourth returned value is currently not used as a feature.
            word_sim, tag_sim, char_sim, _w_s_sim = comment_similarity(
                GRAMS[language], comment, i)
            features['w_sim_%d_%s' % (i, language)] = word_sim
            features['t_sim_%d_%s' % (i, language)] = tag_sim
            features['c_sim_%d_%s' % (i, language)] = char_sim
    return (features, label)
def check_marks(a, b):
    """Award half a mark for every entry of ``b`` that is a word of ``a``.

    Args:
        a: the text to search; it is split on whitespace into words.
        b: an iterable of keywords.  Each entry is checked independently, so
            a keyword listed twice is rewarded twice.

    Returns:
        ``0.5`` times the number of entries of ``b`` that occur as a whole
        word in ``a`` (integer ``0`` when none do).
    """
    # Only membership matters, so a plain set replaces the nltk FreqDist
    # (and its function-level import) the function used to build.
    words = set(a.split())
    return sum(0.5 for keyword in b if keyword in words)
def diccionario_bigramPalabras():
    """Build a lookup of the most frequent word bigram per translated key.

    All files under the ``Corpus`` directory are read and tokenized into
    alphabetic words.  Every distinct bigram ``(w1, w2)`` is mapped to the
    key ``(w1, traducciones.traduce_numerico(w2))``; when several bigrams
    share a key, the one with the highest frequency is kept.

    Returns:
        dict mapping ``(w1, translated_w2)`` to ``[bigram, frequency]``.
    """
    # Read and tokenize the corpus.
    corpus = PlaintextCorpusReader("Corpus", '.*')
    tokenizer = RegexpTokenizer(r'[a-zA-Záéíóúñ]+')
    tokens = tokenizer.tokenize(corpus.raw())

    # Build the bigram -> frequency dictionary.
    fdist = FreqDist(bigrams(tokens))
    dict_bigrams = {}
    for b in fdist:
        count = fdist.get(b)
        b_tr = (b[0], traducciones.traduce_numerico(b[1]))
        # Explicit lookup instead of a bare ``except:``, which would also
        # have hidden unrelated errors (and KeyboardInterrupt).
        current = dict_bigrams.get(b_tr)
        if current is None or current[1] < count:
            dict_bigrams[b_tr] = [b, count]
    return dict_bigrams
def get_uni(first, second, uni):
    """Print unigram statistics for the lines of ``first``.

    Every line is tokenized with ``nltk.word_tokenize``; tokens made up only
    of punctuation characters are dropped and the remaining tokens are
    counted as 1-grams.  The 50 most common unigrams and the count of the
    unigram ``("but",)`` (``None`` if absent) are printed.

    Args:
        first: iterable of text lines to analyse.
        second: not used by this function.
        uni: not used by this function.
    """
    # Compiled once; re.escape makes every punctuation character literal
    # (the unescaped class silently left the backslash out).
    punctuation_only = re.compile('[' + re.escape(string.punctuation) + ']+')

    unigram_fdist = FreqDist()
    for line in first:
        tokens = [
            tok for tok in nltk.word_tokenize(line)
            if not punctuation_only.fullmatch(tok)
        ]
        unigram_fdist.update(ngrams(tokens, 1))

    print(unigram_fdist.most_common(50))
    # Keys are 1-tuples produced by ngrams(..., 1); looking up the bare
    # string "but" could never match.
    print(unigram_fdist.get(("but",)))
def collectFreqData(file_name, initalkeys=()):
    """Count whitespace-separated words of a text file.

    Args:
        file_name: path of the file to read.
        initalkeys: optional words that must be present in the result even
            when they do not occur in the file (used for tf-idf).  ``None``
            is treated like an empty collection.

    Returns:
        A ``FreqDist`` of the words in the file.  The ``"<end_comment>"``
        entry is always reset to 0, and every word of ``initalkeys`` that was
        not seen is present with count 0.  If the file does not exist a
        message is printed and an empty ``FreqDist`` is returned.
    """
    # Immutable default plus explicit None handling: the previous version
    # accepted None in its guard but then crashed iterating over it.
    keys = initalkeys if initalkeys is not None else ()

    # initial keys for use in tf-idf.
    if len(keys) > 0:
        fqdist = initalizeFreqDistWithKeys(keys)
    else:
        fqdist = FreqDist()

    if not path.exists(file_name):
        print("no file or no data for: " + file_name)
        return FreqDist()

    with open(file_name, "r") as current_file:
        for line in current_file:
            for word in line.split():
                fqdist[word] = fqdist.get(word, 0) + 1

    fqdist["<end_comment>"] = 0
    for word in keys:
        if fqdist.get(word, 0) == 0:
            fqdist[word] = 0
    return fqdist
def calculate_smooth_nb_probabilities():
    """Compute +1-smoothed Naive Bayes log-probabilities for both classes.

    For every word of the module-level ``poswords`` / ``negwords`` lists the
    stored value is::

        log((count(word) + 1) / (types + tokens + 1))

    where ``types`` is the number of distinct words and ``tokens`` the total
    number of words of that class; the extra 1 in the denominator stands for
    the unseen word.

    Returns:
        ``(smooth_poswordprobs, smooth_negwordprobs)``: two dicts mapping
        each word of the class to its smoothed log-probability.
    """
    pos_counts = FreqDist(poswords)
    neg_counts = FreqDist(negwords)

    pos_denominator = len(set(poswords)) + len(poswords) + 1
    neg_denominator = len(set(negwords)) + len(negwords) + 1

    smooth_poswordprobs = {
        token: math.log((pos_counts.get(token) + 1) / pos_denominator)
        for token in pos_counts
    }
    smooth_negwordprobs = {
        token: math.log((neg_counts.get(token) + 1) / neg_denominator)
        for token in neg_counts
    }
    return (smooth_poswordprobs, smooth_negwordprobs)
def transfer(fileDj, vocabulary):
    """Turn a text file into a bag-of-words count vector.

    Args:
        fileDj: path of the document to read.
        vocabulary: iterable of terms defining the vector's positions.

    Returns:
        A list with one entry per term of ``vocabulary`` (same order): the
        number of times that term occurs among the document's
        ``nltk.word_tokenize`` tokens, 0 if it does not occur.  Tokens are
        counted as-is (no stemming).
    """
    # ``with`` closes the file even if reading fails.
    with open(fileDj, "r") as fo:
        content = fo.read()

    fdist = FreqDist(nltk.word_tokenize(content))
    # One lookup with a default instead of ``in keys()`` followed by get().
    return [fdist.get(key, 0) for key in vocabulary]
class Article:
    """A text with its tokens, POS-tag counts and reduced POS features.

    Attributes set by ``__init__``:
        body: the raw text.
        category: the label given by the caller (``'Unknown'`` by default).
        words: ``word_tokenize(body)``.
        sentences: ``sent_tokenize(body)``.
        word_tags: ``pos_tag(words)``.
        frequencies: ``FreqDist`` over the tags in ``word_tags``.
        reduced_frequencies: dict filled by ``features_reduction``.
    """

    def __init__(self, body, category='Unknown'):
        self.body = body
        self.category = category
        self.words = word_tokenize(self.body)
        self.sentences = sent_tokenize(self.body)
        self.word_tags = pos_tag(self.words)
        self.frequencies = FreqDist([tagged[1] for tagged in self.word_tags])
        self.reduced_frequencies = {}
        self.features_reduction()

    def get_frequency(self, feature_name):
        """Return the count stored for tag ``feature_name``, 0 if absent."""
        try:
            count = int(self.frequencies.get(feature_name))
        except TypeError:
            # ``get`` returned None: the tag never occurred.
            count = 0
        return count

    def features_reduction(self):
        """Fill ``reduced_frequencies`` by summing counts of related tags."""
        # (feature name, tags whose counts are added together), in the order
        # the features are stored.
        groups = (
            ('adjectives', ('JJ', 'JJR', 'JJS')),
            ('adverbs', ('RB', 'RBR', 'RBS')),
            ('articles', ('DT',)),
            ('conjunctions', ('CC',)),
            ('interjections', ('UH',)),
            ('nouns', ('NN', 'NNS', 'NNP', 'NNPS')),
            ('numerals', ('CD',)),
            ('past_part', ('VBN',)),
            ('prepositions', ('IN',)),
            ('pronouns', ('PRP', 'PRP$')),
            ('punctuation', ('``', "''", '(', ')', ',', '--', '.', ':')),
            ('symbols', ('SYM',)),
        )
        for feature, tags in groups:
            total = 0
            for tag in tags:
                total = total + self.get_frequency(tag)
            self.reduced_frequencies[feature] = total

    def avg_word_length(self):
        """Return ``mean`` of the lengths of all tokens in ``words``."""
        lengths = [len(token) for token in self.words]
        return mean(lengths)
def unigramFreqFile(subreddit):
    """Append unigram counts of a subreddit's text files to its count file.

    Words are the whitespace-separated tokens of every file returned by
    ``getTextFileNames(subreddit)``.  Tokens starting with ``"http"``,
    purely numeric tokens and tokens of 23 or more characters are skipped.
    The count of the ``"<end_comment>"`` marker is reset to 0, so it does not
    contribute to the total.

    Output appended to ``getCountFileName(subreddit)``: one line with the
    total number of counted words, then one ``word count`` line per word.
    """
    # get filtered files
    filenames = getTextFileNames(subreddit)
    countFileName = getCountFileName(subreddit)

    with open(countFileName, "a+", errors='ignore') as countVectorFile:
        frequencies = FreqDist()
        for filename in filenames:
            print("sending normalized values of " + filename + " to " + countFileName)
            with open(filename, "r", errors="ignore") as current_file:
                for line in current_file:
                    for word in line.split():
                        if word.startswith("http") or word.isnumeric():
                            continue
                        if 0 < len(word) < 23:
                            frequencies[word] = frequencies.get(word, 0) + 1
        frequencies["<end_comment>"] = 0

        # write total number of words, on its own line: without the
        # terminator the total ran straight into the first word.
        countVectorFile.write(str(frequencies.N()) + "\n")
        countVectorFile.writelines(
            word + " " + str(frequencies[word]) + "\n"
            for word in frequencies)
def bigramFreqFile(subreddit):
    """Append bigram counts of a subreddit's text files to its count file.

    Bigrams are built per line from the whitespace-separated tokens of every
    file returned by ``getTextFileNames(subreddit)``.  A bigram is counted
    only when both words are shorter than 23 characters and its second word
    is not the ``"<end_comment>"`` marker.

    Output appended to ``getCountFileName(subreddit, unigram=False)``: one
    line with the total number of counted bigrams, then one
    ``word1 word2 count`` line per bigram.
    """
    # get filtered files
    filenames = getTextFileNames(subreddit)
    countfilename = getCountFileName(subreddit, unigram=False)

    with open(countfilename, "a+", errors='ignore') as countVectorFile:
        frequencies = FreqDist()
        # good candidate for multithreading: one thread per file, each with
        # its own freq dist, combined after all finish.
        for filename in filenames:
            print("sending normalized values of " + filename + " to " + countfilename)
            with open(filename, "r", errors="ignore") as current_file:
                for line in current_file:
                    for bigram in bigrams(line.split()):
                        okayrange = 0 < len(bigram[0]) < 23 and 0 < len(bigram[1]) < 23
                        if okayrange and bigram[1] != "<end_comment>":
                            frequencies[bigram] = frequencies.get(bigram, 0) + 1

        # Every record is terminated with a newline.  Previously neither the
        # total nor the entries were, so the file was a single line in which
        # each count ran into the next bigram's first word.
        countVectorFile.write(str(frequencies.N()) + "\n")
        # note, another good improvement, organize this for faster searching.
        countVectorFile.writelines(
            " ".join(bigram) + " " + str(frequencies[bigram]) + "\n"
            for bigram in frequencies)
def get_feat_basic_text(self, text):
    """Extract 13 basic lexical features from ``text``.

    Args:
        text: the text to analyse.

    Returns:
        ``(features, failed)``.  On success ``failed`` is ``False`` and
        ``features`` is the list::

            [e, n_sentences, n_tokens, n_distinct_tokens,
             n_'!', n_'?', n_',', n_'.',
             n_nouns(NN, NNP), n_verbs(VB, VBN, VBG, VBD),
             n_determiners(DT), n_adjectives(JJ), n_adverbs(RB)]

        where ``e = (1/n) * sum(f * (log10(n) - log10(f)))`` over the
        frequency ``f`` of every distinct token (``n`` = number of tokens).
        If anything raises (including empty text), the error is logged and
        ``(MISSING_FEATURE * 13, True)`` is returned.
    """
    import math
    from nltk import FreqDist

    try:
        tokens = nltk.word_tokenize(text)
        tags = nltk.pos_tag(tokens)
        set_tokens = set(tokens)
        n = len(tokens)
        F = FreqDist(tokens)
        # Count the tags themselves.  The previous code counted
        # (word, tag) pairs, so every tag lookup missed and all five
        # part-of-speech features were always 0.
        Ftags = FreqDist(tag for _word, tag in tags)

        # math.log10 replaces np.math.log10; the np.math alias has been
        # removed from NumPy.
        log_n = math.log10(n)
        k = sum(freq_w * (log_n - math.log10(freq_w))
                for freq_w in F.values())
        e = k / n

        nr_exclamations = F.get('!', 0)
        nr_quotation_mark = F.get('?', 0)  # counts '?' tokens
        nr_comma = F.get(',', 0)
        nr_dot = F.get('.', 0)

        sent_tokenize_list = nltk.sent_tokenize(text)

        pos = [['NN', 'NNP'], ['VB', 'VBN', 'VBG', 'VBD'], ['DT'], ['JJ'],
               ['RB']]
        freq_pos = [sum(Ftags.get(p, 0) for p in pp) for pp in pos]

        ret = [
            e,
            len(sent_tokenize_list),
            len(tokens),
            len(set_tokens), nr_exclamations, nr_quotation_mark, nr_comma,
            nr_dot
        ]
        ret.extend(freq_pos)
        return ret, False

    except Exception as err:
        config.logger.error(repr(err))
        return MISSING_FEATURE * 13, True
aggfunc=np.sum) # pivot_by_freq = pivot_by_freq.reset_index() ret_axes: Axes = pivot_by_freq.plot(kind='barh') ret_axes.plot(pivot_by_freq['num_words'], list(ret_axes.get_yticks())) ret_axes.get_legend().remove() ret_axes.set_xlabel("Bin wise cumulative word numbers") ret_axes.set_title("Whole corpus") ret_axes.set_axisbelow(True) ret_axes.grid(True) plt.show() ########## Inspect by word length ############ len_counter = FreqDist() len_to_freq_dict = {} for key in vocab: len_counter[len(key)] = len_counter.get(len(key), 0) + 1 len_to_freq_dict[len(key)] = len_to_freq_dict.get(len(key), 0) + vocab[key] len_df = pd.DataFrame.from_records(list(len_counter.items()), columns=['word_length', 'num_words']) len_df.sort_values('word_length', inplace=True) len_df = len_df.merge(pd.DataFrame.from_records( list(len_to_freq_dict.items()), columns=['word_length', 'frequency']), on='word_length', how='inner', sort=True) ret_axes: Axes = len_df.plot(x='word_length', y='num_words', marker='o', color='blue',
class NGramModel:
    """Add-one smoothed word-level n-gram model for ``n`` = 2 or 3.

    Sentences are padded with ``"<s>"`` / ``"</s>"``.  The model stores the
    counts of the n-grams of order ``n`` and ``n - 1`` and scores a word as
    ``log((count(ngram) + 1) / (count(context) + 1))``.
    """

    def __init__(self, n):
        """Create an untrained model of order ``n``."""
        self.n = n
        self.tokens_arr = []
        self.freq_dist = FreqDist()

    def train(self, tokens_arr):
        """Count the n-grams of ``tokens_arr`` (an iterable of token lists)."""
        self.tokens_arr = tokens_arr
        self.freq_dist = FreqDist(self.get_ngrams())

    def get_freq(self, ngram):
        """Return the add-one smoothed count of ``ngram`` (1 if unseen)."""
        return self.freq_dist.get(ngram, 0) + 1

    def get_ngrams(self):
        """Return the n-grams to count: all unigrams, then bigrams, then trigrams.

        Unigrams are kept only for ``n == 2``, bigrams for ``n <= 3`` and
        trigrams for ``n == 3``.
        """
        unigrams = []
        bigrams = []
        trigrams = []
        print_status("Creating n-grams...")
        j = 0
        for sent in self.tokens_arr:
            words = list(
                pad_sequence(sent,
                             pad_left=True,
                             left_pad_symbol="<s>",
                             pad_right=True,
                             right_pad_symbol="</s>",
                             n=self.n))
            for ngram in everygrams(words, max_len=self.n):
                size = len(ngram)
                if size == 1 and self.n == 2:
                    unigrams.append(ngram)
                elif size == 2 and self.n <= 3:
                    bigrams.append(ngram)
                elif size == 3 and self.n == 3:
                    # Trigrams were previously never collected, so a
                    # trigram model treated every trigram as unseen.
                    trigrams.append(ngram)
            # Progress message roughly every tenth of the input.
            if j % (len(self.tokens_arr) / 10) == 0:
                print(f"token {j} of {len(self.tokens_arr)}")
            j += 1
        return unigrams + bigrams + trigrams

    def load_ngrams_freq(self, freq_dist):
        """Use an already computed n-gram frequency mapping."""
        self.freq_dist = freq_dist

    def get_word_log_prob(self, s, word_index):
        """Return the smoothed log-probability of ``s[word_index]`` given its history.

        Args:
            s: the sentence as a sequence of tokens.
            word_index: position of the word to score.

        Returns:
            ``log((count(ngram) + 1) / (count(context) + 1))``.  For orders
            other than 2 and 3 the probability is 0, i.e. ``np.log(0)``.
        """
        prob = 0
        if self.n == 2:
            if word_index == 0:
                bigram = ('<s>', s[word_index])
                unigram = ('<s>', )
            else:
                bigram = (s[word_index - 1], s[word_index])
                unigram = (s[word_index - 1], )
            prob = self.get_freq(bigram) / self.get_freq(unigram)
        elif self.n == 3:
            if word_index == 0:
                trigram = ('<s>', '<s>', s[word_index])
                bigram = ('<s>', '<s>')
            elif word_index == 1:
                if len(s) >= 2:
                    trigram = ('<s>', s[word_index - 1], s[word_index])
                else:
                    # One-word sentence: score the sentence end.  Uses the
                    # "</s>" pad symbol the model is trained with (this was
                    # "</w>", which can never occur in the counts).
                    trigram = ('<s>', s[word_index - 1], '</s>')
                bigram = ('<s>', s[word_index - 1])
            else:
                trigram = (s[word_index - 2], s[word_index - 1],
                           s[word_index])
                bigram = (s[word_index - 2], s[word_index - 1])
            prob = self.get_freq(trigram) / self.get_freq(bigram)
        return np.log(prob)
txt = fp.read() for i, sent in enumerate(sent_tokenize(txt)): for chunk in ne_chunk(pos_tag(word_tokenize(sent))): if hasattr(chunk, 'label') and chunk.label() == 'ORGANIZATION': org = ' '.join(c[0] for c in chunk) if org == phrase: print('Organization:', org, 'File:', file, 'Sentence Number:', i) print(sent) content.append(file) # print the location and sentensence if neccessary # if org == phrase: # print('Organization:', org, 'File:', file, 'Sentence Number:', i) # print(sent) fp.close() pd.DataFrame(content).to_csv('test.csv') # rank the frequency and get the 5 highest ones fdist = FreqDist(results) frequents = [tag for (tag, _) in fdist.most_common(10)] print(frequents) for frequent in frequents: print(frequent, fdist.get(frequent)) tag = [tag for (tag, _) in fdist.most_common()] num = [num for (_, num) in fdist.most_common()] out = pd.DataFrame() out['Organization'] = tag out['Counts'] = num out.to_csv('NLTK_ORG1.csv')
class LabelProducer(object):
    """Suggest words that describe the relation between pairs of terms.

    For each term pair, Wikipedia is searched, the intro extracts of the
    resulting pages are fetched, and the words of every sentence containing
    all words of the pair are counted.  Candidate words are ranked by the
    product of their tf-idf values over all pairs, where the "document
    frequency" is the word's frequency in the Brown corpus.
    """

    __TOTAL_ARTICLES = 5696797  # Number of articles in wikipedia. Used for tf-idf calculation
    __TERM_FREQUENCIES_DICT = {}  # Term frequencies for words. Used for tf-idf calculation
    __EXTRACTS_PER_TERM = 20  # Number of document intros to process per search term pair
    __STOP_WORDS = frozenset()  # Words to filter out from results

    def __init__(self) -> None:
        super().__init__()
        nltk.download('stopwords')
        nltk.download('brown')
        # A set makes the per-word membership test in __filter_words O(1).
        self.__STOP_WORDS = frozenset(stopwords.words('english'))
        frequencies_file_name = '../data/frequency.pickle'
        self.__load_term_frequency_dict(frequencies_file_name)

    def __load_term_frequency_dict(self, frequencies_file_name):
        """
        Create or load the frequency dictionary that is used in tf-idf calculation to speed up the calculation
        Args:
            frequencies_file_name: The file name from which to load and save the frequency dict
        """
        try:
            print('Loading word frequencies dictionary...')
            # NOTE: pickle.load can execute arbitrary code; this is only
            # acceptable because the file is a cache written by this class.
            with open(frequencies_file_name, 'rb') as handle:
                self.__TERM_FREQUENCIES_DICT = pickle.load(handle)
        except Exception:
            # Missing or unreadable cache: rebuild it.  ``Exception`` rather
            # than a bare except, so Ctrl-C / SystemExit are not swallowed.
            print(
                'Word frequencies dictionary doesn\'t exist; Creating it and saving it to file...'
            )
            self.__TERM_FREQUENCIES_DICT = FreqDist(word.lower()
                                                    for word in brown.words())
            os.makedirs(os.path.dirname(frequencies_file_name), exist_ok=True)
            with open(frequencies_file_name, 'wb') as handle:
                pickle.dump(self.__TERM_FREQUENCIES_DICT,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
        print('Finished loading word frequencies dictionary')

    def __get_wiki_page_list(self, term):
        """
        Get a relevant page list for the given term
        Args:
            term: a string to use for querying wikipedia to get relevant documents
        Returns:
            A list of page titles related to the given term
        """
        return wikipedia.search(term)[:self.__EXTRACTS_PER_TERM]

    def __get_pages(self, term):
        """
        Query Wikipedia for the given term and return sentences that contain all of the words in the term
        Args:
            term: the term to query for
        Returns:
            List of (lower-cased) sentences found that contain all the words
        """
        titles = self.__get_wiki_page_list(term=term)
        result = []
        if len(titles) > 0:
            # Let requests build and URL-encode the query string; titles
            # containing '&', '#' or '+' corrupted the hand-built URL.
            # MediaWiki treats the flag parameters (exintro, explaintext) as
            # true whenever they are present.
            res = requests.get(
                "https://en.wikipedia.org/w/api.php",
                params={
                    "format": "json",
                    "action": "query",
                    "prop": "extracts",
                    "exlimit": "max",
                    "exintro": 1,
                    "explaintext": 1,
                    "formatversion": 2,
                    "titles": "|".join(titles),
                })
            # Extracts are lower-cased below, so the search words must be
            # lower-cased too or capitalised terms can never match.
            term_tokens = term.lower().split(' ')
            pages = res.json()['query']['pages']
            for item in pages:
                try:
                    page = item['extract'].lower()
                except Exception as e:
                    # Best effort: report pages without a usable extract
                    # and carry on with the rest.
                    print(e)
                    continue
                for sentence in page.split("."):
                    if len(sentence) <= len(term):
                        continue
                    if all(token in sentence for token in term_tokens):
                        result.append(sentence)
        return result

    def __get_term_frequency(self, term):
        """
        Get the frequency of the given term
        Args:
            term: the term to get the frequency of
        Returns:
            The frequency of the term from the frequencies dict. 0 if it doesn't exist in the dict
        """
        return self.__TERM_FREQUENCIES_DICT.get(term, 0)

    def __get_tf_idf(self, term, count):
        """
        Calculate the tf/idf of the given term
        Args:
            term: the term to calculate tf/idf of
            count: the term frequency
        Returns:
            The tf/idf value for the given term; 0 for terms missing from the frequencies dict
        """
        hit_docs = self.__get_term_frequency(term)
        if hit_docs == 0:
            return 0
        idf = math.log(self.__TOTAL_ARTICLES / hit_docs)
        return count * idf

    def __filter_words(self, text):
        """
        Filter out words from the given text
        Args:
            text: the words to filter
        Returns:
            All the words in the text that are not in the stop words list, have length greater than 1 and are alphabetical characters
        """
        return [
            word for word in text if word not in self.__STOP_WORDS
            and len(word) > 1 and word.isalpha()
        ]

    def __tokenize(self, text):
        """
        Tokenize the given text
        Args:
            text: the text to tokenize
        Returns:
            The tokens of the text after filtering of words using __filter_words
        """
        return self.__filter_words(nltk.word_tokenize(text))

    def __term_counter(self, term):
        """
        Count the occurrences of different words in the sentences that contain all the words of the given term
        Args:
            term: the term to query for
        Returns:
            A counter of occurrences of all the unfiltered words that returned from the given query
        """
        result_words = []
        for doc in self.__get_pages(term):
            result_words += self.__tokenize(doc.lower())
        return Counter(result_words)

    def calculate_most_probable_relations(self, terms, topn=20):
        """
        Get the most probable labels for the given list of terms
        Args:
            terms: a list of 2-tuples which are terms to search labels for
            topn: number of labels to return
        Returns:
            A list of at most 'topn' labels which represent the relations between the words in the terms sorted in descending order of probability
        """
        term_str = {' '.join(pair) for pair in terms}
        terms_set = {
            word.lower()
            for term in term_str for word in term.split()
        }
        term_counters = [self.__term_counter(x) for x in term_str]
        # The query words themselves are never offered as labels.
        all_terms = {k for d in term_counters for k in d.keys()} - terms_set
        all_dict = {}
        for term in all_terms:
            product = 1
            for termc in term_counters:
                current_count = termc.get(term)
                if current_count is not None:
                    product *= self.__get_tf_idf(term, current_count)
            all_dict[term] = product
        sorted_result = sorted(all_dict.items(),
                               key=operator.itemgetter(1),
                               reverse=True)
        return [label for label, score in sorted_result if score > 0][:topn]
class NGramModel:
    """Add-one smoothed character-level n-gram model for orders 2 to 6.

    Words are padded with ``"<w>"`` / ``"</w>"`` (``n - 1`` symbols on each
    side).  The model stores the counts of the character n-grams of order
    ``n`` and ``n - 1`` and scores a word as the sum, over every n-gram of
    the padded word, of ``log((count(ngram) + 1) / (count(context) + 1))``.
    """

    def __init__(self, n):
        """Create an untrained model of order ``n``."""
        self.n = n
        self.tokens_dict = dict()
        self.freq_dist = FreqDist()

    def train(self, tokens_dict):
        """Count the n-grams of ``tokens_dict`` (token -> occurrence count)."""
        self.tokens_dict = tokens_dict
        self.freq_dist = FreqDist(self.get_ngrams())

    def get_freq(self, ngram):
        """Return the add-one smoothed count of ``ngram`` (1 if unseen)."""
        return self.freq_dist.get(ngram, 0) + 1

    def get_ngrams(self):
        """Return the character n-grams to count, each repeated once per token occurrence.

        Only n-grams of length ``n - 1`` and ``n`` are kept, and only for
        ``2 <= n <= 6``.  The result is ordered by n-gram length (all
        unigrams first, then bigrams, and so on).
        """
        # buckets[k] collects the n-grams of length k + 1.
        buckets = [[] for _ in range(6)]
        supported = 2 <= self.n <= 6
        print_status("Creating n-grams...")
        j = 0
        for token in self.tokens_dict.keys():
            # NOTE(review): float keys are presumably NaN placeholders from
            # the data source; confirm.  They are reported and skipped.
            if type(token) is float:
                print(f"ERROR : unknown token {token}")
                continue
            chars = list(
                pad_sequence(str(token),
                             pad_left=True,
                             left_pad_symbol="<w>",
                             pad_right=True,
                             right_pad_symbol="</w>",
                             n=self.n))
            count = self.tokens_dict[token]
            for ngram in everygrams(chars, max_len=self.n):
                size = len(ngram)
                if supported and size >= self.n - 1:
                    buckets[size - 1].extend(ngram for _ in range(count))
            # Progress message roughly every tenth of the vocabulary.
            if j % (len(self.tokens_dict) / 10) == 0:
                print(f"token {j} of {len(self.tokens_dict)}")
            j += 1
        return [ngram for bucket in buckets for ngram in bucket]

    def load_ngrams_freq(self, freq_dist):
        """Use an already computed n-gram frequency mapping."""
        self.freq_dist = freq_dist

    def get_word_log_prob(self, word):
        """Return the smoothed log-probability of ``word``.

        The word is padded with ``n - 1`` ``"<w>"`` symbols on the left and
        ``n - 1`` ``"</w>"`` symbols on the right; every window of ``n``
        consecutive symbols contributes
        ``log(get_freq(window) / get_freq(window[:-1]))``.

        Args:
            word: the sequence of characters to score.

        Returns:
            The summed log-probability, or ``0`` when the model order is
            outside 2..6.
        """
        if not 2 <= self.n <= 6:
            return 0

        # A single sliding window replaces the hand-enumerated boundary
        # cases for every order.  It yields the same n-grams, and fixes the
        # n == 2 end-of-word bigram, which was built as the nested tuple
        # (char, ('</w>',)) and therefore never matched a trained bigram.
        pad = self.n - 1
        padded = ['<w>'] * pad + list(word) + ['</w>'] * pad

        word_log_prob = 0
        for start in range(len(word) + pad):
            ngram = tuple(padded[start:start + self.n])
            context = ngram[:-1]
            prob = self.get_freq(ngram) / self.get_freq(context)
            word_log_prob += np.log(prob)
        return word_log_prob
def doc_features(doc):
    """Map every word of ``word_features`` to its count in ``doc``.

    Stop words (as decided by ``isStopWord``) are dropped from ``doc`` before
    counting.

    Returns:
        dict with one ``'count (<word>)'`` entry per word in
        ``word_features``; the value is 0 for words not in the document.
    """
    kept = (token for token in doc if not isStopWord(token))
    counts = FreqDist(kept)
    return {
        'count (%s)' % word: counts.get(word, 0)
        for word in word_features
    }
def doc_features(doc):
    """Build a feature dict of per-document counts for every tracked word.

    Tokens of ``doc`` that ``isStopWord`` flags are dropped before counting.
    The result holds one ``'count (<word>)'`` key for each entry of the
    externally defined ``word_features``; words absent from ``doc`` map to 0.
    """
    counts = FreqDist(token for token in doc if not isStopWord(token))
    return {
        'count (%s)' % feature_word: counts.get(feature_word, 0)
        for feature_word in word_features
    }
def remove_low_freq_phrases(keyPhrases, cutOff):
    """
    Purpose: Remove low frequency key phrases, which can muddy up the final
             results. A key phrase repeated inside the SAME comment is only
             counted (and returned) once, so that 1 customer saying the same
             thing over and over does not skew the results in their favor.

    Args:
        keyPhrases - A list of strings. Each entry holds the key phrases of
                     1 comment, separated by commas. A key phrase can be
                     1-4 words long. Whitespace around each phrase is
                     ignored, so 'gift card' and ' gift card' are the same
                     phrase. Phrases of a single character are discarded.
        cutOff     - An integer frequency. A key phrase that occurs in fewer
                     than cutOff comments is removed.

    Returns: A list of strings with one entry per input entry, in the same
             order. Each entry lists the surviving key phrases of that
             comment joined by ', ' (an empty string if none survive).

    Raises:  Nothing explicitly.

    Example:
             keyPhrases[0] = 'gift card, restaurant week photo competition,
                              two tables'
             cutOff = 2
             "gift card" must appear in at least 2 comments, otherwise it is
             dropped from the returned entry.
    """
    from collections import Counter

    def unique_phrases(entry):
        # Split one comment into its distinct, whitespace-trimmed phrases,
        # keeping the order of first appearance.
        seen = set()
        phrases = []
        for raw in entry.split(','):
            phrase = raw.strip()
            if len(phrase) > 1 and phrase not in seen:
                seen.add(phrase)
                phrases.append(phrase)
        return phrases

    per_comment = [unique_phrases(entry) for entry in keyPhrases]

    # Each comment contributes at most one count per phrase, so the count is
    # the number of comments that mention the phrase.
    counts = Counter(
        phrase for phrases in per_comment for phrase in phrases)

    return [
        ', '.join(phrase for phrase in phrases if counts[phrase] >= cutOff)
        for phrases in per_comment
    ]
# Unigram probability of every known vocabulary entry.
wordPos = [(x, dataRaw_pdist.prob(x)) for x in vocabRaw_tokens_nopunct]

# Probability mass left over for the unknown token (UNK): whatever the known
# vocabulary does not account for.
KPos = 0
for y in vocabRaw_tokens_nopunct:
    KPos += dataRaw_pdist.prob(y)
UNKPos = [('UNK', (1 - KPos))]
wordPos.append(UNKPos[0])

# Add-one smoothed estimates. The extra "+ 1" in the denominator reserves a
# slot for UNK. The denominator is computed before the vocabulary list is
# extended below, so its length is still the original vocabulary size.
_smoothing_total = (len(dataRaw_tokens_nopunct) +
                    len(vocabRaw_tokens_nopunct) + 1)
# A symbol that never occurs must count as 0 (not None), otherwise the
# smoothing itself fails with a TypeError on exactly the case it is for.
_Pa = (dataRaw_fdist.get('a', 0) + 1) / _smoothing_total
_Pb = (dataRaw_fdist.get('b', 0) + 1) / _smoothing_total
_Pc = (dataRaw_fdist.get('c', 0) + 1) / _smoothing_total
_Punk = 1 - _Pa - _Pb - _Pc
_Pa, _Pb, _Pc, _Punk

# Prepare the vocabulary for the bigram model: sentence boundary markers plus
# a catch-all entry built from the first three vocabulary items.
# NOTE(review): "[^...]" looks like a negated character class, which presumes
# the vocabulary items are single characters - confirm.
s1 = '<s>'
s2 = '</s>'
Know = (vocabRaw_tokens_nopunct[0] + vocabRaw_tokens_nopunct[1] +
        vocabRaw_tokens_nopunct[2])
vocabRaw_tokens_nopunct.append("[^" + Know + "]")
vocabRaw_tokens_nopunct.append(s1)
class Text():
    r'''Sentence-level view of one text with a lemma vocabulary and a word trie.

    Sentences are produced lazily by an internal generator that tags each
    sentence, updates the lemma vocabulary and the word counter, and builds
    the prefix tree once the input is exhausted. Previously saved results
    can be loaded from ``datadir`` instead of re-parsing the input.

    >>> source = os.path.abspath(r"..\CORPUS\en")
    >>> sents = Text(Stream(Path(source,"*.txt")))
    >>> for sent in sents: print(sent)
    '''

    def __init__(
            self,
            intake,
            prep: Prep = None,
            clean: TextCleaner = None,
            filters: TokenFilter = None,
            inplace=False,
            datadir=None,
            encoding=chardetector,
            verbose=True,
            rewrite=False,
            loadas="pickle",
            saveas=("txt", "pickle"),
            input='filename'  # str {'filename', 'file', 'text'}
    ):
        '''Set up the text and, when possible, load cached results.

        Args:
            intake: the source; interpreted according to ``input``.
            prep: preprocessor; its ``sentencizer`` is used for sentence
                splitting and its ``speller`` is exposed via ``speller``.
            clean: cleaner handed to the stream together with the
                sentencizer.
            filters: token filters handed to every ``TaggedSentence``.
            inplace: parse the whole input immediately (only when nothing
                was loaded from cache).
            datadir: directory for cached artifacts; defaults to a ``data``
                directory next to this module.
            encoding: encoding (or detector) handed to ``Stream``; only
                used when ``input == 'filename'``.
            verbose: print the paths of loaded/saved artifacts.
            rewrite: when true, ignore cached artifacts.
            loadas: ``'pickle'`` loads pickled sentences/vocabulary and the
                saved trie in the constructor; ``'txt'`` makes the parser
                read the saved ``.tagged.txt`` file if it exists.
            saveas: format or formats written by ``save``.
            input: ``'filename'`` (``intake`` is a path), ``'text'``
                (``intake`` is a string) or ``'file'`` (``intake`` is passed
                to the stream as is).
        '''
        self._path = ''
        self.filename = ''
        self.name = ''
        self.inplace = inplace
        self.verbose = verbose
        self.rewrite = rewrite
        self.loadas = loadas
        self.saveas = saveas
        self.encoding = 'unknown'

        if datadir:
            self.datadir = os.path.abspath(datadir)
        else:
            self.datadir = os.path.join(
                os.path.abspath(os.path.dirname(__file__)), "data")

        self._encoding = None
        self._nwords = 0
        self._sents = []
        self._vocab = FreqDist()
        self._trie = dawg.RecordDAWG(">IH")

        if input == 'filename':
            self._path = intake
            self.filename = os.path.basename(self._path)
            self.name = os.path.splitext(self.filename)[0]
            self._encoding = encoding
            self._input = intake
            if not self.rewrite and self.loadas == 'pickle':
                # all sentences from the text
                self._sents = self.loadpickle('sents') or []
                # All unique normalized words from the text. Rebuilding the
                # vocabulary from the sentences takes several seconds, so
                # reading it from the pickle (a fraction of a second) is
                # faster.
                self._vocab = self.loadpickle('vocab') or FreqDist()
                # prefix tree
                self._trie = self.loaddawg('trie') or dawg.RecordDAWG(">IH")
        elif input == "text":
            self._input = io.StringIO(intake)
            self._path = ''
            self.filename = self._input.__class__.__name__
            self.name = self.filename
        elif input == "file":
            self._input = intake
            self._path = ''
            self.filename = self._input.__class__.__name__
            self.name = self.filename

        if self._sents:
            self._nwords = sum(sent.nwords for sent in self._sents)

        self._prep = prep
        self._clean = clean
        self._filters = filters
        self._iter = self.__sents__()

        # Close the generator if data is loaded: nothing is left to parse.
        # NOTE(review): iterating the object then yields nothing; use
        # sents() to reach the loaded sentences.
        if self._sents:
            self._iter.close()
        elif self.inplace:
            # Exhaust the generator right away to parse the whole input.
            list(self._iter)

    def __sents__(self):
        '''Parse the input lazily, yielding one ``TaggedSentence`` at a time.

        While iterating, the lemma vocabulary, the word counter and the
        sentence list are updated; after the last sentence the prefix tree
        is rebuilt from all sentences.
        '''
        encoding = self._encoding
        sentencizer = self._prep.sentencizer
        clean = self._clean
        path = self._input

        if self.loadas == 'txt' and self._path:
            tagged_path = datapath(
                self._path, datadir=self.datadir, ext=".tagged.txt").full
            # Switch to the saved tagged text only when it really exists;
            # otherwise keep parsing the original input.
            if os.path.exists(tagged_path):
                path = tagged_path
                encoding = 'utf-8'
                # The saved text is already split and cleaned.
                sentencizer = None
                clean = None

        stream = Stream(path, encoding=encoding)
        self.encoding = stream._encoding

        for num, sent in enumerate(stream(sentencizer, clean)):
            tagged_sent = TaggedSentence(sent.strip(), num, self._prep,
                                         self._filters)
            lemmas = tagged_sent.lemmas()
            # Every lemma ends up in this vocabulary, since nothing is
            # filtered here.
            self._vocab += FreqDist(lemmas)
            self._nwords += tagged_sent.nwords
            self._sents.append(tagged_sent)
            yield tagged_sent

        data = ((token.word, (token.nsent, token.idx))
                for sent in self.sents()
                for token in sent.tokens(lower=True))
        self._trie = dawg.RecordDAWG(">IH", data)

    @property
    def embed(self):
        '''Access to Word2Vec for building embeddings.'''
        return word2vec()

    @property
    def trie(self):
        '''Access to the prefix tree of the text.'''
        return Trie(self._trie)

    @property
    def vocab(self):
        '''Access to the vocabulary of the text's lemmas.'''
        return self._vocab

    @property
    def nsents(self):
        '''Number of sentences in the text.'''
        return len(self._sents)

    @property
    def nwords(self):
        '''Number of words in the text.'''
        return self._nwords

    @property
    def nlemmas(self):
        '''Number of lemmas in the text.'''
        return len(self._vocab)

    def _iter_by_sents(self, attr, filtrate=False, lower=False, pos=None):
        '''Yield, per sentence, the list of ``attr`` values of its tokens.'''
        for sent in self.sents():
            tokens = sent.tokens(filtrate=filtrate, lower=lower, pos=pos)
            yield [getattr(token, attr) for token in tokens]

    def sents2words(self, filtrate=False, lower=False, pos=None):
        '''Return the text as a list of sentences, each a list of words.'''
        return list(self._iter_by_sents(
            attr="word", filtrate=filtrate, lower=lower, pos=pos))

    def sents2lemmas(self, filtrate=False, lower=False, pos=None):
        '''Return the text as a list of sentences, each a list of lemmas.'''
        return list(self._iter_by_sents(
            attr="lemma", filtrate=filtrate, lower=lower, pos=pos))

    def sents(self, n_sent=None, max_words=None, min_words=None):
        '''Return sentences, optionally selected by index or word count.

        With ``n_sent`` the result is whatever indexing the sentence list
        with it gives. Otherwise the sentences whose ``nwords`` lies within
        the given (inclusive) bounds are returned; without bounds the
        internal sentence list itself is returned, not a copy.
        '''
        if n_sent is not None:
            return self._sents[n_sent]
        if min_words is None and max_words is None:
            return self._sents
        return [
            sent for sent in self._sents
            if (min_words is None or sent.nwords >= min_words)
            and (max_words is None or sent.nwords <= max_words)
        ]

    def iterwords(self, filtrate=False, lower=True, pos=None):
        '''Yield the word of every token, sentence by sentence.'''
        for sent in self._sents:
            for token in sent.tokens(filtrate=filtrate, lower=lower, pos=pos):
                yield token.word

    def iterlemmas(self, filtrate=False, lower=True, pos=None):
        '''Yield the lemma of every token, sentence by sentence.'''
        for sent in self._sents:
            for token in sent.tokens(filtrate=filtrate, lower=lower, pos=pos):
                yield token.lemma

    # TODO: rework to extract from the trie
    def words(self, filtrate=False, lower=True, pos=None, uniq=False):
        '''Return all words; with ``uniq`` only distinct ones (unordered).'''
        result = list(self.iterwords(filtrate=filtrate, lower=lower, pos=pos))
        if uniq:
            result = list(set(result))
        return result

    # TODO: rework to extract from the trie
    def lemmas(self, filtrate=False, lower=True, pos=None, uniq=False):
        '''Return all lemmas; with ``uniq`` only distinct ones (unordered).'''
        result = list(
            self.iterlemmas(filtrate=filtrate, lower=lower, pos=pos))
        if uniq:
            result = list(set(result))
        return result

    def tags(self):
        '''Return the concatenated ``tags`` of all sentences.'''
        result = []
        for sent in self._sents:
            result.extend(sent.tags)
        return result

    def postags(self,
                pos=None,
                sort=False,
                top=0,
                universal_tagset=False,
                ret_cond=False):
        '''Build frequency dictionaries or frequency-sorted lists of parts of speech.

        Args:
            pos: tag to select. Unless ``universal_tagset`` is set, the keys
                'NOUN', 'VERB', 'ADJ' and 'ADV' are expanded to the group of
                tags listed in ``maps`` and their counts are merged.
            sort: passed to the ``_sort`` helper when ``top`` is not set.
            top: when non-zero the result is passed to the ``_top`` helper.
            universal_tagset: treat ``pos`` as a literal tag.
            ret_cond: also return the list of tags seen in the text.

        Returns:
            The (possibly reduced) distribution, or a ``(result, tags)``
            pair when ``ret_cond`` is set.
        '''

        def merge(tags):
            result = FreqDist()
            for tag in tags:
                result += cfd[tag]
            return result

        maps = {
            'NOUN': {'NN', 'NNS', 'NNP', 'NNPS'},
            'VERB': {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'},
            'ADJ': {'JJ', 'JJR', 'JJS'},
            'ADV': {'RB', 'RBR', 'RBS'},
        }

        cfd = ConditionalFreqDist()
        for sent in self._sents:
            for _tok, tag, lemma in sent.tags:
                cfd[tag][lemma.lower()] += 1

        # Taken before any lookup below, because looking up a missing tag
        # adds an empty condition.
        cond = cfd.conditions()
        result = cfd

        if pos:
            if not universal_tagset and pos in maps:
                result = merge(maps[pos])
            else:
                result = cfd[pos]

        if top:
            result = _top(result, top)
        else:
            result = _sort(result, sort)

        if ret_cond:
            result = result, cond
        return result

    def suffix(self, affix):
        '''Suffix tree search (not implemented).'''
        pass

    def stats(self):
        '''Assorted statistics on the text (not implemented).'''
        pass

    def count(self, token=None, words=True, uniq=False, lower=True):
        '''Count words or lemmas.

        Args:
            token: when given, return the number of occurrences of this
                word (``words=True``) or lemma (``words=False``); 0 when it
                does not occur.
            words: count word forms if true, lemmas otherwise.
            uniq: without ``token``, return the number of distinct items
                instead of the total number of occurrences.
            lower: passed to ``words()`` when distinct words are counted.
        '''
        if words:
            if token:
                # Total occurrences of the word in the text. The default is
                # an empty list so that an absent word counts as 0.
                return len(self._trie.get(token, []))
            if uniq:
                # Number of unique words. All words have to be collected
                # first, because this figure is not stored anywhere.
                return len(self.words(uniq=True, lower=lower))
            # Total occurrences of all word forms.
            return self.nwords

        # By lemmas.
        if token:
            # Total occurrences of the lemma in the text.
            return self._vocab.get(token, 0)
        if uniq:
            # Number of unique lemmas.
            return len(self._vocab)
        # Total occurrences of all lemmas.
        return sum(self._vocab.values())

    def keywords(
            self,
            by='words',
            rating=('rake',
                    dict(max_words=4,
                         stopwords=nltk.corpus.stopwords.words('english')))):
        '''Extract keywords from the text.

        Args:
            by: ``'words'`` to work on word forms, anything else for lemmas.
            rating: ``(method, options)`` pair. Only ``'rake'`` is
                supported; its options are passed to ``Rake`` as keywords.

        Returns:
            The ``Rake`` object built from the sentences.

        Raises:
            NotImplementedError: for the ``'textrank'`` method.
            ValueError: for any other unknown method.
        '''
        method = rating[0]
        if method == 'textrank':
            # Not implemented, since the TextRank class in use only
            # produces scores for sentences.
            raise NotImplementedError(
                "keywords rating 'textrank' is not implemented")
        if method != 'rake':
            raise ValueError(
                'unknown keywords rating: {!r}'.format(method))

        sents = [
            sent.words() if by == 'words' else sent.lemmas()
            for sent in self._sents
        ]
        return Rake(sents, **rating[1])

    # Building the graph in TextRank takes a lot of memory for large texts
    # (> 20 thousand word occurrences)!
    def summarize(self, top=7, scores=True):
        '''Rank sentences with TextRank.

        Returns the ``top`` best sentences as raw strings, paired with
        their scores as ``(score, raw)`` when ``scores`` is set. With a
        falsy ``top`` the ``TextRank`` object itself is returned.
        '''
        words = [set(sent.lemmas(uniq=True)) for sent in self.sents()]
        textrank = TextRank(words, self.nsents)
        if not top:
            return textrank

        ranked = textrank.topn(n=top)
        if scores:
            return [(score, self._sents[idx].raw) for idx, score in ranked]
        return [self._sents[idx].raw for idx, _score in ranked]

    def doc2bow(self):
        pass

    def ngrams(self, n, words=False, filtrate=False, lower=True, **kwargs):
        '''Yield n-grams over the lemmas (or the words if ``words`` is set).'''
        method = self.words if words else self.lemmas
        yield from nltk.ngrams(method(filtrate=filtrate, lower=lower), n,
                               **kwargs)

    def skipgrams(self,
                  n,
                  k,
                  words=False,
                  filtrate=False,
                  lower=True,
                  **kwargs):
        '''Yield skip-grams over the lemmas (or the words if ``words`` is set).'''
        method = self.words if words else self.lemmas
        yield from nltk.skipgrams(method(filtrate=filtrate, lower=lower), n,
                                  k, **kwargs)

    def collocations(self):
        pass

    def hapaxes(self, words=False, filtrate=False):
        '''Extract the hapaxes (items that occur only once) of the text.'''
        if not words:
            # look among the lemmas
            res = self._vocab
        else:
            res = FreqDist(self.words(filtrate=filtrate))
        return res.hapaxes()

    @property
    def speller(self):
        '''Result of calling the preprocessor's ``speller()``.'''
        return self._prep.speller()

    def _validpath(self, path):
        return os.path.exists(path)

    def loadpickle(self, name, path=None):
        '''Load the pickled artifact ``name``; return None if it is missing.'''
        path_ = path or datapath(self._path, datadir=self.datadir).short
        path = '{}.{}.pickle'.format(path_, name)
        if not self._validpath(path):
            return None
        if self.verbose:
            print('loading pickle:'.ljust(16),
                  path.replace(nlptk.MODULEDIR, '..'))
        # NOTE(review): unpickling can execute arbitrary code; only load
        # files from a data directory you trust.
        with open(path, 'rb') as f:
            return pickle.load(f)

    def loaddawg(self, name, path=None):
        '''Load the saved DAWG ``name``; return None if it is missing.'''
        path_ = path or datapath(self._path, datadir=self.datadir).short
        path = '{}.{}.dawg'.format(path_, name)
        if not self._validpath(path):
            return None
        if self.verbose:
            print('loading dawg:'.ljust(16),
                  path.replace(nlptk.MODULEDIR, '..'))
        d = dawg.RecordDAWG(">IH")
        return d.load(path)

    def savedawg(self, name, path=None):
        '''Save the prefix tree under the artifact name ``name``.'''
        path_ = path or datapath(self._path, datadir=self.datadir).short
        # saving the dictionary for the prefix tree
        path = '{}.{}.dawg'.format(path_, name)
        if self.verbose:
            print('saving dawg:'.ljust(16),
                  path.replace(nlptk.MODULEDIR, '..'))
        self._trie.save(path)

    def save(self, path=None, as_=("txt", "pickle")):
        '''Write the parsed text to the data directory.

        The formats come from ``self.saveas`` and fall back to ``as_`` only
        when that is empty. ``'txt'`` writes the tagged sentences as text;
        ``'pickle'`` writes the sentences, the vocabulary and the trie.
        '''
        # Also creates missing parent directories and tolerates an existing
        # directory.
        os.makedirs(self.datadir, exist_ok=True)

        path_ = path or datapath(self._path, datadir=self.datadir).short
        saveas = self.saveas or as_
        if not isinstance(saveas, (tuple, list)):
            saveas = (saveas, )

        for fmt in saveas:
            if fmt == "txt":
                path = '{}.tagged.txt'.format(path_)
                if self.verbose:
                    print('saving txt:'.ljust(16),
                          path.replace(nlptk.MODULEDIR, '..'))
                with open(path, 'w', encoding='utf8') as f:
                    f.write('\n'.join(map(str, self._sents)))
            elif fmt == 'pickle':
                path = '{}.sents.pickle'.format(path_)
                if self.verbose:
                    print('saving pickle:'.ljust(16),
                          path.replace(nlptk.MODULEDIR, '..'))
                with open(path, 'wb') as f:
                    pickle.dump(self._sents, f)
                path = '{}.vocab.pickle'.format(path_)
                with open(path, 'wb') as f:
                    pickle.dump(self._vocab, f)
                self.savedawg('trie', path_)

    def __iter__(self):
        return self._iter

    def __next__(self):
        return next(self._iter)

    def __str__(self):
        return '\n'.join([str(sent) for sent in self.sents()])

    def __repr__(self):
        fmt = ("Text(\n\tname='{}',\n\tencoding='{}',\n\t"
               "nsents={},\n\tnwords={},\n\tnlemmas={}\n)")
        return fmt.format(self.name, self.encoding, self.nsents, self.nwords,
                          self.nlemmas)