Example #1
def find_abbreviations():
    import db
    from tokenizers import es
    from nltk import FreqDist

    corpus = db.connect()
    #text = '\n'.join([a['text'] for a in corpus.articles.find().limit(10)])
    text = '\n'.join([a['text'] for a in corpus.articles.find()])
    tokens = es.tokenize(text, ignore_abbreviations=True)

    fd = FreqDist()        # all tokens
    fd_abbr = FreqDist()   # tokens immediately followed by '.'
    fd_n_abbr = FreqDist() # tokens not followed by '.'
    n_tokens = len(tokens)
    for i in range(n_tokens):
        fd[tokens[i]] += 1
        if i < (n_tokens - 1) and tokens[i + 1] == '.':
            fd_abbr[tokens[i]] += 1
        else:
            fd_n_abbr[tokens[i]] += 1

    adjusted = {}
    f_avg = len(fd) / fd.N()
    for t, n in fd_abbr.items():
        f = fd.get(t, 0) / fd.N()
        deviation = 1 + (f - f_avg)
        adjusted[t] = n * deviation / fd_n_abbr.get(t, 1) / len(t)

    items = sorted(adjusted.items(), key=lambda i: i[1], reverse=True)
    for t, n in items[:100]:
        print('%s. %f (%d, %d)' % (t, n, fd_abbr[t], fd_n_abbr.get(t, 0)))
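
FreqDist subclasses collections.Counter, so indexing a missing key returns 0 while .get() returns None unless a default is supplied. A minimal standalone sketch of that behaviour:

from nltk import FreqDist

fd = FreqDist("abracadabra")                  # counts characters: a=5, b=2, r=2, c=1, d=1
print(fd["a"], fd.get("a"))                   # 5 5
print(fd["z"], fd.get("z"), fd.get("z", 0))   # 0 None 0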
Example #2
    def work_1():
        file_string = ""
        txt_file = open("trabalho1.txt", "r+")
        csv_file = open("trabalho1.csv", "w+")
        csv_manage = csv.writer(csv_file,
                                delimiter=";",
                                quoting=csv.QUOTE_MINIMAL)
        base_text = txt_file.read()
        words = word_tokenize(base_text)   # word-level tokens
        frequency = FreqDist(words)

        print("texto : {0}".format(base_text))

        print("Total de palavras : {0}".format(frequency.N()))
        print("Total de Termos : {0}".format(len(frequency.keys())))
        print("")

        print("Tabela de Frequência de Termos")
        print("")

        for key in frequency.keys():
            csv_manage.writerow([key, str(frequency.get(key))])
            print("Termo: {0}  Total: {1}".format(key,
                                                  str(frequency.get(key))))

        pdfOutput = PdfOutput(frequency, frequency.N(), len(frequency.keys()),
                              base_text)
        servicePdfManager = ServiceManagerPdf()
        servicePdfManager.writePdf(pdfOutput)

        txt_file.close()
        csv_file.close()
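
A condensed, self-contained sketch of the same term-frequency CSV export, using only the standard library and NLTK; the file name freq.csv and the sample sentence are illustrative:

import csv
from nltk import FreqDist

freq = FreqDist("o rato roeu a roupa do rei de roma".split())
with open("freq.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file, delimiter=";", quoting=csv.QUOTE_MINIMAL)
    writer.writerows(freq.most_common())   # one "term;count" row per distinct word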
def calculate_nb_probabilities():

    ## GOAL: Populate these two dicts, where each
    ##      key = word from poswords or negwords (created for you above)
    ##      value = NB probability for that word in that class (calculated by you here)

    poswordprobs = {}
    negwordprobs = {}

    #########################################
    ##### YOUR PART B CODE STARTS HERE ######
    #########################################

    ## Create a FreqDist for poswords below.
    freqDistPos = FreqDist(poswords)
    countpos = freqDistPos.N()   # total number of positive tokens (not distinct types)
    ## Create a FreqDist for negwords below.
    freqDistNeg = FreqDist(negwords)
    countneg = freqDistNeg.N()   # total number of negative tokens (not distinct types)
    ## Loop through your poswords FreqDist, and calculate the
    ## probability of each word in the positive class, like this:
    ## P(word|pos) = count(word) / total number of positive tokens
    ## where count(word) is what you get from the FreqDist for poswords.
    ## Store the results in poswordprobs.
    ## USE LOGS!!!

    for word in freqDistPos:
        value = freqDistPos.get(word)
        prob = math.log(value / countpos)
        poswordprobs[word] = prob

    ## Now, loop through your negwords FreqDist, and calculate the
    ## probability of each word in the negative class, like this:
    ## P(word|neg) = count(word) / total number of negative tokens
    ## where count(word) is what you get from the FreqDist for negwords.
    ## Store the results in negwordprobs.
    ## USE LOGS!!!

    for word in freqDistNeg:
        value = freqDistNeg.get(word)
        prob = math.log(value / countneg)
        negwordprobs[word] = prob

    #########################################
    ##### YOUR PART B CODE ENDS HERE ########
    #########################################

    return (poswordprobs, negwordprobs)
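
A condensed sketch of the same unsmoothed Naive Bayes estimate on toy data; poswords below is an illustrative stand-in for the list the assignment provides, and fd.N() is the total token count that the comments call for:

import math
from nltk import FreqDist

poswords = ["good", "great", "good", "fun"]          # toy positive tokens
fd = FreqDist(poswords)
# P(word|pos) = count(word) / total positive tokens, stored as a log probability
poswordprobs = {w: math.log(fd[w] / fd.N()) for w in fd}
print(poswordprobs["good"])                          # log(2/4)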
Example #4
def featureset(sample):
  comment, label = sample
  features = {}
#  tags = map(lambda statement: map(lambda (w,t):t, statement), comment)
  words = [[w for (w, t) in statement] for statement in comment]
  words = sum(words, [])
#  tags = sum(tags, [])
  size_= sum([len(word) for word in words])
  features['stmt_len'] = len(words)/float(len(comment))
  features['word_len'] = size_/float(len(words))
  features['size'] = size_
#  tags_dist = FreqDist(sum(tags, []))
#  for tag in TAGS:
#    features[tag] = tags_dist.get(tag, 0)
  dist = FreqDist([word.lower() for word in words])
#  num_stop_words = float(sum([dist.get(word, 0) for word in EN_STOPWORDS]))
#  features['prob_stop_words'] = num_stop_words/len(words)
  for word in EN_STOPWORDS:
    features[word] = dist.get(word, 0)/float(len(words))
  features['alwayson'] = 1.0
  for language in LANGUAGES:
    for i in range(1,n+1):
      word_sim, tag_sim, char_sim, w_s_sim = comment_similarity(GRAMS[language], comment, i)
      features['w_sim_%d_%s' % (i, language)] = word_sim
      features['t_sim_%d_%s' % (i, language)] = tag_sim
      features['c_sim_%d_%s' % (i, language)] = char_sim
#     features['s_sim_%d_%s' % (i, language)] = w_s_sim
  return (features, label)
Example #5
def featureset(sample):
    comment, label = sample
    features = {}
    #  tags = map(lambda statement: map(lambda (w,t):t, statement), comment)
    words = [[w for (w, t) in statement] for statement in comment]
    words = sum(words, [])
    #  tags = sum(tags, [])
    size_ = sum([len(word) for word in words])
    features['stmt_len'] = len(words) / float(len(comment))
    features['word_len'] = size_ / float(len(words))
    features['size'] = size_
    #  tags_dist = FreqDist(sum(tags, []))
    #  for tag in TAGS:
    #    features[tag] = tags_dist.get(tag, 0)
    dist = FreqDist([word.lower() for word in words])
    #  num_stop_words = float(sum([dist.get(word, 0) for word in EN_STOPWORDS]))
    #  features['prob_stop_words'] = num_stop_words/len(words)
    for word in EN_STOPWORDS:
        features[word] = dist.get(word, 0) / float(len(words))
    features['alwayson'] = 1.0
    for language in LANGUAGES:
        for i in range(1, n + 1):
            word_sim, tag_sim, char_sim, w_s_sim = comment_similarity(
                GRAMS[language], comment, i)
            features['w_sim_%d_%s' % (i, language)] = word_sim
            features['t_sim_%d_%s' % (i, language)] = tag_sim
            features['c_sim_%d_%s' % (i, language)] = char_sim
            #  features['s_sim_%d_%s' % (i, language)] = w_s_sim
    return (features, label)
Example #6
def check_marks(a, b):
    from nltk import FreqDist
    fd = FreqDist(a.split())
    c = 0
    for word in b:
        if fd.get(word) is not None:   # half a mark for each expected word present
            c = c + 0.5
    return c
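
For instance, with hypothetical inputs, half a mark is awarded per expected keyword found in the answer:

print(check_marks("the cat sat on the mat", ["cat", "mat", "dog"]))   # 1.0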
Example #7
def diccionario_bigramPalabras():
    # Read and tokenize the corpus
    corpus = PlaintextCorpusReader("Corpus", '.*')
    tokenizer = RegexpTokenizer(r'[a-zA-Záéíóúñ]+')
    tokens = tokenizer.tokenize(corpus.raw())

    # Build the dictionary of word bigrams + frequency
    bigrams_orig = bigrams(tokens)
    fdist = FreqDist(bigrams_orig)
    dict_bigrams = {}
    for b in fdist:
        b_tr = (b[0], traducciones.traduce_numerico(b[1]))
        try:
            if dict_bigrams[b_tr][1] < fdist.get(b):
                dict_bigrams[b_tr] = [b, fdist.get(b)]
        except KeyError:
            dict_bigrams[b_tr] = [b, fdist.get(b)]

    return dict_bigrams
Example #8
def get_uni(first, second, uni):
    bigramfdist = FreqDist()
    for line in first:
        token = nltk.word_tokenize(line)
        token = [
            x for x in token
            if not re.fullmatch('[' + string.punctuation + ']+', x)
        ]
        bigrams = ngrams(token, 1)
        bigramfdist.update(bigrams)

    print(bigramfdist.most_common(50))
    print(bigramfdist.get(("but",)))  # ngrams() yields tuples, so the key is a 1-tuple
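
Because ngrams() yields tuples even for unigrams, lookups need tuple keys. A minimal sketch with a toy sentence:

from nltk import FreqDist
from nltk.util import ngrams

fdist = FreqDist(ngrams("but the cat sat but".split(), 1))
print(fdist.get("but"))          # None: the keys are 1-tuples, not strings
print(fdist.get(("but",), 0))    # 2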
Example #9
def collectFreqData(file_name, initalkeys=[]):
    # Initial keys, for use in tf-idf.
    if (initalkeys is not None and len(initalkeys) > 0):
        fqdist = initalizeFreqDistWithKeys(initalkeys)
    else:
        fqdist = FreqDist()

    if not path.exists(file_name):
        print("no file or no data for: " + file_name)
        return FreqDist()

    with open(file_name, "r") as current_file:
        for line in current_file:
            for word in line.split():
                fqdist[word] = fqdist.get(word, 0) + 1

    fqdist["<end_comment>"] = 0

    for word in initalkeys:
        if (fqdist.get(word, 0) == 0):
            fqdist[word] = 0

    return fqdist
def calculate_smooth_nb_probabilities():

    smooth_poswordprobs = {}
    smooth_negwordprobs = {}

    #########################################
    ##### YOUR PART C CODE STARTS HERE ######
    #########################################

    # Populate the above dictionaries just as you did in the unsmoothed
    # version, but use +1 smoothing so that you can handle unseen words.
    freqDistPosSmooth = FreqDist(poswords)
    freqDistNegSmooth = FreqDist(negwords)
    # +1 smoothing: when calculating the probabilities,
    # add 1 to every count found in the FreqDist for each class.
    # Divide the count by the number of types...
    #     *plus* the number of tokens for that class...
    #     *plus* 1 (for the count of the unseen word)

    # Don't forget to use logs.
    typesP = len(set(poswords))
    tokensP = len(poswords)
    typesN = len(set(negwords))
    tokensN = len(negwords)
    for word in freqDistPosSmooth:
        valueSmooth = freqDistPosSmooth.get(word)
        countposSmooth = valueSmooth + 1
        probSmooth = math.log(countposSmooth / (typesP + tokensP + 1))
        smooth_poswordprobs[word] = probSmooth
    for word in freqDistNegSmooth:
        valueSmooth = freqDistNegSmooth.get(word)
        countnegSmooth = valueSmooth + 1
        probSmooth = math.log(countnegSmooth / (typesN + tokensN + 1))
        smooth_negwordprobs[word] = probSmooth

    return (smooth_poswordprobs, smooth_negwordprobs)
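
A toy walk-through of the same +1 smoothing, using the denominator the comments specify (types + tokens + 1); the word list is illustrative:

import math
from nltk import FreqDist

negwords = ["bad", "awful", "bad"]            # toy negative tokens
fd = FreqDist(negwords)
types_n, tokens_n = len(fd), fd.N()           # 2 types, 3 tokens

def smoothed_logprob(word):
    return math.log((fd.get(word, 0) + 1) / (types_n + tokens_n + 1))

print(smoothed_logprob("bad"), smoothed_logprob("unseen"))   # log(3/6), log(1/6)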
def transfer(fileDj, vocabulary):
    fo = open(fileDj, "r")
    content = fo.read()
    tokens = nltk.word_tokenize(content)
    # st = [SBStemmer.stem(t) for t in tokens]
    st = tokens
    fo.close()

    fdist = FreqDist(st)
    BOWDj = []
    for key in vocabulary:
        BOWDj.append(fdist.get(key, 0))   # 0 for vocabulary terms absent from the document
    return BOWDj
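
The same bag-of-words idea in isolation, with an illustrative vocabulary and document:

from nltk import FreqDist

vocabulary = ["cat", "dog", "fish"]
fdist = FreqDist("the cat saw the dog and the dog barked".split())
bow = [fdist.get(term, 0) for term in vocabulary]    # [1, 2, 0]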
Example #12
class Article:
    def __init__(self, body, category='Unknown'):
        self.body = body
        self.category = category

        self.words = word_tokenize(self.body)
        self.sentences = sent_tokenize(self.body)

        self.word_tags = pos_tag(self.words)
        self.frequencies = FreqDist([i[1] for i in self.word_tags])
        self.reduced_frequencies = {}
        self.features_reduction()

    def get_frequency(self, feature_name):
        # FreqDist.get() returns None for unseen tags, so default to 0
        return self.frequencies.get(feature_name, 0)

    def features_reduction(self):
        self.reduced_frequencies['adjectives'] = sum(
            self.get_frequency(t) for t in ('JJ', 'JJR', 'JJS'))
        self.reduced_frequencies['adverbs'] = sum(
            self.get_frequency(t) for t in ('RB', 'RBR', 'RBS'))
        self.reduced_frequencies['articles'] = self.get_frequency('DT')
        self.reduced_frequencies['conjunctions'] = self.get_frequency('CC')
        self.reduced_frequencies['interjections'] = self.get_frequency('UH')
        self.reduced_frequencies['nouns'] = sum(
            self.get_frequency(t) for t in ('NN', 'NNS', 'NNP', 'NNPS'))
        self.reduced_frequencies['numerals'] = self.get_frequency('CD')
        self.reduced_frequencies['past_part'] = self.get_frequency('VBN')
        self.reduced_frequencies['prepositions'] = self.get_frequency('IN')
        self.reduced_frequencies['pronouns'] = sum(
            self.get_frequency(t) for t in ('PRP', 'PRP$'))
        self.reduced_frequencies['punctuation'] = sum(
            self.get_frequency(t)
            for t in ('``', "''", '(', ')', ',', '--', '.', ':'))
        self.reduced_frequencies['symbols'] = self.get_frequency('SYM')

    def avg_word_length(self):
        return mean([len(i) for i in self.words])
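
A small sketch of the same tag-frequency reduction without the tagger download, using a hypothetical pre-tagged sentence with Penn Treebank tags:

from nltk import FreqDist

tagged = [("The", "DT"), ("quick", "JJ"), ("brown", "JJ"), ("fox", "NN"), ("jumps", "VBZ")]
tag_freq = FreqDist(tag for _, tag in tagged)
adjectives = tag_freq.get("JJ", 0) + tag_freq.get("JJR", 0) + tag_freq.get("JJS", 0)   # 2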
Example #13
def unigramFreqFile(subreddit):
    # get filtered files
    filenames = getTextFileNames(subreddit)
    countFileName = getCountFileName(subreddit)
    with open(countFileName, "a+", errors='ignore') as countVectorFile:
        frequencies = FreqDist()
        for filename in filenames:
            print("sending normalized values of " + filename + " to " + countFileName)
            with open(filename, "r", errors="ignore") as current_file:
                for line in current_file:
                    for word in line.split():
                        word = word.strip()
                        if word.startswith("http") or word.isnumeric():
                            continue
                        if 0 < len(word) < 23:
                            frequencies[word] = frequencies.get(word, 0) + 1

        frequencies["<end_comment>"] = 0
        # write total number of words
        countVectorFile.write(str(frequencies.N()))
        for word in frequencies:
            countVectorFile.write(word+" "+str(frequencies[word])+"\n")
Example #14
def bigramFreqFile(subreddit):
    #get filtered files
    filenames = getTextFileNames(subreddit)
    countfilename = getCountFileName(subreddit, unigram=False)
    with open(countfilename, "a+", errors='ignore') as countVectorFile:
        frequencies = FreqDist()

        #good candidate for multithreading: one thread per file, each with its own freq dist, combined after all finish.
        for filename in filenames:
            print("sending normalized values of " + filename + " to " + countfilename)
            with open(filename, "r", errors="ignore") as current_file:
                for line in current_file:
                    for bigram in list(bigrams(line.split())):
                        okayrange = 0 < len(bigram[0])  < 23 and 0 < len(bigram[1]) < 23
                        if okayrange and bigram[1] != "<end_comment>":
                            frequencies[bigram] = frequencies.get(bigram, 0) + 1

        #write total number of words on its own line
        countVectorFile.write(str(frequencies.N()) + "\n")

        #note, another good improvement: organize this for faster searching.
        for bigram in frequencies:
            countVectorFile.write(" ".join(bigram) + " " + str(frequencies[bigram]) + "\n")
Example #15
    def get_feat_basic_text(self, text):
        from nltk import FreqDist
        import numpy as np
        try:
            tokens = nltk.word_tokenize(text)
            tags = nltk.pos_tag(tokens)
            set_tokens = set(tokens)
            n = len(tokens)
            F = FreqDist(tokens)
            Ftags = FreqDist(tag for _, tag in tags)  # count the POS tags, not the (word, tag) pairs

            k = 0
            for st in set_tokens:
                freq_w = F.get(st)
                k += (freq_w * (np.math.log10(n) - np.math.log10(freq_w)))

            e = (1 / len(tokens)) * k

            # exclamations
            nr_exclamations = 0
            nr_quotation_mark = 0
            nr_comma = 0
            nr_dot = 0
            for s in tokens:
                if s == '!':
                    nr_exclamations += 1
                elif s == ',':
                    nr_comma += 1
                elif s == '.':
                    nr_dot += 1
                elif s == '?':
                    nr_quotation_mark += 1  # counts question marks

            #nr_sent_pos = 0
            #nr_sent_neg = 0
            #nr_sent_neu = 0
            sent_tokenize_list = nltk.sent_tokenize(text)
            #for s in sent_tokenize_list:
            #    x = self.get_feat_sentiment(s)
            #    if x > .5:
            #        nr_sent_pos += 1
            #    elif .4 <= x <= .5:
            #        nr_sent_neu += 1
            #    else:
            #        nr_sent_neg += 1

            pos = [['NN', 'NNP'], ['VB', 'VBN', 'VBG', 'VBD'], ['DT'], ['JJ'],
                   ['RB']]
            freq_pos = []
            for pp in pos:
                y = 0
                for p in pp:
                    y += Ftags.get(p, 0)
                freq_pos.append(y)

            ret = [
                e,
                len(sent_tokenize_list),
                len(tokens),
                len(set_tokens), nr_exclamations, nr_quotation_mark, nr_comma,
                nr_dot
            ]
            ret.extend(freq_pos)  # nr_sent_pos, nr_sent_neu, nr_sent_neg,

            return ret, False

        except Exception as e:
            config.logger.error(repr(e))
            return MISSING_FEATURE * 13, True
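
The quantity e above is an entropy-like score, (1/n) * sum over distinct tokens w of F(w) * (log10 n - log10 F(w)). A compact sketch on toy input:

import math
from nltk import FreqDist

tokens = "to be or not to be".split()
F, n = FreqDist(tokens), len(tokens)
e = sum(F[w] * (math.log10(n) - math.log10(F[w])) for w in set(tokens)) / n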
                               aggfunc=np.sum)
# pivot_by_freq = pivot_by_freq.reset_index()
ret_axes: Axes = pivot_by_freq.plot(kind='barh')
ret_axes.plot(pivot_by_freq['num_words'], list(ret_axes.get_yticks()))
ret_axes.get_legend().remove()
ret_axes.set_xlabel("Bin wise cumulative word numbers")
ret_axes.set_title("Whole corpus")
ret_axes.set_axisbelow(True)
ret_axes.grid(True)
plt.show()

########## Inspect by word length ############
len_counter = FreqDist()
len_to_freq_dict = {}
for key in vocab:
    len_counter[len(key)] = len_counter.get(len(key), 0) + 1
    len_to_freq_dict[len(key)] = len_to_freq_dict.get(len(key), 0) + vocab[key]

len_df = pd.DataFrame.from_records(list(len_counter.items()),
                                   columns=['word_length', 'num_words'])
len_df.sort_values('word_length', inplace=True)
len_df = len_df.merge(pd.DataFrame.from_records(
    list(len_to_freq_dict.items()), columns=['word_length', 'frequency']),
                      on='word_length',
                      how='inner',
                      sort=True)

ret_axes: Axes = len_df.plot(x='word_length',
                             y='num_words',
                             marker='o',
                             color='blue',
Example #17
class NGramModel:
    def __init__(self, n):
        self.n = n
        self.tokens_arr = []
        self.freq_dist = FreqDist()

    def train(self, tokens_arr):
        self.tokens_arr = tokens_arr
        ngrams = self.get_ngrams()
        self.freq_dist = FreqDist(ngrams)

    def get_freq(self, ngram):
        if (self.freq_dist.get(ngram) is None):
            return 1
        else:
            return self.freq_dist.get(ngram) + 1

    def get_ngrams(self):
        unigrams = []
        bigrams = []
        trigrams = []
        print_status("Creating n-grams...")
        j = 0
        for sent in self.tokens_arr:
            words = list(
                pad_sequence(sent,
                             pad_left=True,
                             left_pad_symbol="<s>",
                             pad_right=True,
                             right_pad_symbol="</s>",
                             n=self.n))
            ngrams = list(everygrams(words, max_len=self.n))
            for ngram in ngrams:
                if (len(ngram) == 1 and self.n == 2):
                    unigrams.append(ngram)
                if (len(ngram) == 2 and self.n <= 3):
                    bigrams.append(ngram)
                if (len(ngram) == 3 and self.n == 3):
                    trigrams.append(ngram)  # needed so the trigram model has counts
            if j % max(1, len(self.tokens_arr) // 10) == 0:
                print(f"token {j} of {len(self.tokens_arr)}")
            j += 1
        return unigrams + bigrams + trigrams

    def load_ngrams_freq(self, freq_dist):
        self.freq_dist = freq_dist

    def get_word_log_prob(self, s, word_index):
        prob = 0
        if (self.n == 2):
            if (word_index == 0):
                bigram = ('<s>', s[word_index])
                unigram = ('<s>', )
            else:
                bigram = (s[word_index - 1], s[word_index])
                unigram = (s[word_index - 1], )
            prob = self.get_freq(bigram) / self.get_freq(unigram)
        elif (self.n == 3):
            if (word_index == 0):
                trigram = ('<s>', '<s>', s[word_index])
                bigram = ('<s>', '<s>')
            elif (word_index == 1):
                trigram = ('<s>', s[word_index - 1],
                           s[word_index]) if len(s) >= 2 else ('<s>',
                                                               s[word_index - 1],
                                                               '</s>')
                bigram = ('<s>', s[word_index - 1])
            else:
                trigram = (s[word_index - 2], s[word_index - 1], s[word_index])
                bigram = (s[word_index - 2], s[word_index - 1])
            prob = self.get_freq(trigram) / self.get_freq(bigram)
        return np.log(prob)
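
A minimal illustration of the padding and counting used by train(), with the nltk.util helpers and a toy sentence:

from nltk import FreqDist
from nltk.util import everygrams, pad_sequence

sent = ["the", "cat", "sat"]
padded = list(pad_sequence(sent, n=2, pad_left=True, left_pad_symbol="<s>",
                           pad_right=True, right_pad_symbol="</s>"))
fd = FreqDist(everygrams(padded, max_len=2))
print(fd.get(("<s>", "the")))   # 1
print(fd.get(("the", "dog")))   # None, which get_freq() above maps to 1 (add-one style)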
Example #18
    txt = fp.read()
    for i, sent in enumerate(sent_tokenize(txt)):
        for chunk in ne_chunk(pos_tag(word_tokenize(sent))):
            if hasattr(chunk, 'label') and chunk.label() == 'ORGANIZATION':
                org = ' '.join(c[0] for c in chunk)
                if org == phrase:
                    print('Organization:', org, 'File:', file,
                          'Sentence Number:', i)
                    print(sent)
                    content.append(file)
                # print the location and sentence if necessary
                # if org == phrase:
                #     print('Organization:', org, 'File:', file, 'Sentence Number:', i)
                #     print(sent)
    fp.close()
pd.DataFrame(content).to_csv('test.csv')

# rank by frequency and take the 10 most common organizations
fdist = FreqDist(results)
frequents = [tag for (tag, _) in fdist.most_common(10)]

print(frequents)
for frequent in frequents:
    print(frequent, fdist.get(frequent))
tag = [tag for (tag, _) in fdist.most_common()]
num = [num for (_, num) in fdist.most_common()]
out = pd.DataFrame()
out['Organization'] = tag
out['Counts'] = num
out.to_csv('NLTK_ORG1.csv')
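
A compact illustration of the most_common()/get() pairing used above, with made-up organization names:

from nltk import FreqDist

fdist = FreqDist(["ACME", "ACME", "Initech", "ACME", "Initech", "Globex"])
for org, _ in fdist.most_common(2):
    print(org, fdist.get(org))    # ACME 3, Initech 2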
class LabelProducer(object):

    __TOTAL_ARTICLES = 5696797  # Number of articles in wikipedia. Used for tf-idf calculation
    __TERM_FREQUENCIES_DICT = {
    }  # Term frequencies for words. Used for tf-idf calculation
    __EXTRACTS_PER_TERM = 20  # Number of document intros to process per search term pair
    __STOP_WORDS = {}  # Words to filter out from results

    def __init__(self) -> None:
        super().__init__()
        nltk.download('stopwords')
        nltk.download('brown')
        self.__STOP_WORDS = stopwords.words('english')
        frequencies_file_name = '../data/frequency.pickle'
        self.__load_term_frequency_dict(frequencies_file_name)

    def __load_term_frequency_dict(self, frequencies_file_name):
        """
        Create or load the frequency dictionary that is used in tf-idf calculation
        to speed up the calculation
        
        Args:
            The file name from which to load and save the frequency dict
        """
        try:
            print('Loading word frequencies dictionary...')
            with open(frequencies_file_name, 'rb') as handle:
                self.__TERM_FREQUENCIES_DICT = pickle.load(handle)
        except Exception:
            print(
                'Word frequencies dictionary doesn\'t exist; Creating it and saving it to file...'
            )
            self.__TERM_FREQUENCIES_DICT = FreqDist(word.lower()
                                                    for word in brown.words())
            os.makedirs(os.path.dirname(frequencies_file_name), exist_ok=True)
            with open(frequencies_file_name, 'wb') as handle:
                pickle.dump(self.__TERM_FREQUENCIES_DICT,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)

        print('Finished loading word frequencies dictionary')

    def __get_wiki_page_list(self, term):
        """
        Get a relevant page list for the given term
        
        Args:
            term: a string to use for querying wikipedia to get relevant documents
        
        Returns:
            A list of page titles related to the given term
        """
        page_titles = wikipedia.search(term)[:self.__EXTRACTS_PER_TERM]
        return page_titles

    def __get_pages(self, term):
        """
        Query Wikipedia for the given term and return sentences that
        contain all of the words in the term
        
        Args:
            term: the term to query for
            
        Returns:
            List of sentences found that contain all the words
        """
        titles = self.__get_wiki_page_list(term=term)
        result = []

        if len(titles) > 0:
            res = requests.get(
                "https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exlimit=max&exintro&explaintext&formatversion=2&titles="
                + "|".join(titles))
            term_tokens = term.split(' ')
            pages = res.json()['query']['pages']
            for item in pages:
                try:
                    page = item['extract'].lower()
                    sentences = list(
                        filter(lambda x: len(x) > len(term), page.split(".")))
                    for sentence in sentences:
                        contains_all = True
                        for term_token in term_tokens:
                            if term_token not in sentence:
                                contains_all = False
                                break
                        if contains_all:
                            result += [sentence]
                except Exception as e:
                    print(e)
        return result

    def __get_term_frequency(self, term):
        """
        Get the frequency of the given term
        
        Args:
            term: the term to get the frequency of
        
        Returns:
            The frequency of the term from the frequencies dict, or 0 if it does
            not exist in the dict
        """
        return self.__TERM_FREQUENCIES_DICT.get(term, 0)

    def __get_tf_idf(self, term, count):
        """
        Calculate the tf/idf of the given term
        
        Args:
            term: the term to calculate tf/idf of
            count: the term frequency
            
        Returns:
            The tf/idf value for the given term
        """
        hit_docs = self.__get_term_frequency(term)
        if hit_docs == 0:
            return 0
        idf = math.log(self.__TOTAL_ARTICLES / hit_docs)
        tf_idf = count * idf
        return tf_idf

    def __filter_words(self, text):
        """
        Filter out words from the given text
        
        Args:
            text: the text to filter
        
        Returns:
            All the words in the text that are not in the stop words list, have
            length greater than 1 and are alphabetical characters
        """
        return [
            word for word in text if word not in self.__STOP_WORDS
            and len(word) > 1 and word.isalpha()
        ]

    def __tokenize(self, text):
        """
        Tokenize the given text
        
        Args:
            text: the text to tokenize
            
        Returns:
            The text after tokenization and filtering of words
            using the __filter_words function
        """
        return self.__filter_words(nltk.word_tokenize(text))

    def __term_counter(self, term):
        """
        Count the occurrences of different words in the sentences that contain
        all the words of the given term
        
        Args:
            term: the term to query for
            
        Returns:
            A counter of occurrences of all the unfiltered words that returned from
            the given query
        """
        res = self.__get_pages(term)
        result_words = []
        for doc in res:
            result_words += self.__tokenize(doc.lower())
        counter = Counter(result_words)
        return counter

    def calculate_most_probable_relations(self, terms, topn=20):
        """
        Get the most probable labels for the given list of terms
        
        Args:
            terms: a list of 2-tuples which are terms to search labels for
            topn: number of labels to return
            
        Returns:
            A list of 'topn' labels which represent the relations between the words
            in the terms, sorted in descending order of probability
        """

        term_str = {' '.join(pair) for pair in terms}

        terms_set = {
            word.lower()
            for term in term_str for word in term.split()
        }

        term_counters = [self.__term_counter(x) for x in term_str]
        all_terms = {k for d in term_counters for k in d.keys()} - terms_set
        all_dict = {}

        for term in all_terms:
            product = 1
            for termc in term_counters:
                current_count = termc.get(term)
                if current_count is not None:
                    product *= self.__get_tf_idf(term, current_count)
            all_dict[term] = product

        sorted_result = sorted(all_dict.items(),
                               key=operator.itemgetter(1),
                               reverse=True)

        result = list(
            map(lambda x: x[0],
                list(filter(lambda pair: pair[1] > 0, sorted_result))[:topn]))

        return result
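
A back-of-the-envelope version of the tf-idf computed by __get_tf_idf, with illustrative counts standing in for real corpus statistics:

import math

TOTAL_ARTICLES = 5696797   # the Wikipedia article count used above
doc_freq = 1200            # hypothetical corpus frequency of a term
count = 7                  # hypothetical count of the term in the retrieved sentences
tf_idf = count * math.log(TOTAL_ARTICLES / doc_freq)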
Example #20
class NGramModel:
    def __init__(self, n):
        self.n = n
        self.tokens_dict = dict()
        self.freq_dist = FreqDist()

    def train(self, tokens_dict):
        self.tokens_dict = tokens_dict
        ngrams = self.get_ngrams()
        self.freq_dist = FreqDist(ngrams)

    def get_freq(self, ngram):
        if (self.freq_dist.get(ngram) is None):
            return 1
        else:
            return self.freq_dist.get(ngram) + 1

    def get_ngrams(self):
        unigrams = []
        bigrams = []
        trigrams = []
        fourgrams = []
        fivegrams = []
        sixgrams = []
        print_status("Creating n-grams...")
        j = 0
        for token in self.tokens_dict.keys():
            if type(token) is float:
                print(f"ERROR : unknown token {token}")
                continue
            chars = list(
                pad_sequence(str(token),
                             pad_left=True,
                             left_pad_symbol="<w>",
                             pad_right=True,
                             right_pad_symbol="</w>",
                             n=self.n))
            ngrams = list(everygrams(chars, max_len=self.n))
            for ngram in ngrams:
                if (len(ngram) == 1 and self.n == 2):
                    for i in range(self.tokens_dict[token]):
                        unigrams.append(ngram)
                if (len(ngram) == 2 and self.n <= 3):
                    for i in range(self.tokens_dict[token]):
                        bigrams.append(ngram)
                if (len(ngram) == 3 and self.n <= 4):
                    for i in range(self.tokens_dict[token]):
                        trigrams.append(ngram)
                if (len(ngram) == 4 and self.n <= 5):
                    for i in range(self.tokens_dict[token]):
                        fourgrams.append(ngram)
                if (len(ngram) == 5 and self.n <= 6):
                    for i in range(self.tokens_dict[token]):
                        fivegrams.append(ngram)
                if (len(ngram) == 6 and self.n <= 6):
                    for i in range(self.tokens_dict[token]):
                        sixgrams.append(ngram)
            if j % max(1, len(self.tokens_dict) // 10) == 0:
                print(f"token {j} of {len(self.tokens_dict)}")
            j += 1
        return unigrams + bigrams + trigrams + fourgrams + fivegrams + sixgrams

    def load_ngrams_freq(self, freq_dist):
        self.freq_dist = freq_dist

    def get_word_log_prob(self, word):
        word_log_prob = 0
        if (self.n == 2):
            for i in range(len(word) + 1):
                if (i == 0):
                    bigram = ('<w>', word[i])
                    unigram = ('<w>', )
                elif (i == len(word)):
                    bigram = (word[i - 1], '</w>')
                    unigram = (word[i - 1], )
                else:
                    bigram = (word[i - 1], word[i])
                    unigram = (word[i - 1], )
                prob = self.get_freq(bigram) / self.get_freq(unigram)
                word_log_prob += np.log(prob)

        elif (self.n == 3):
            for i in range(len(word) + 2):
                if (i == 0):
                    trigram = ('<w>', '<w>', word[i])
                    bigram = ('<w>', '<w>')
                elif (i == 1):
                    trigram = ('<w>', word[i - 1],
                               word[i]) if len(word) >= 2 else ('<w>',
                                                                word[i - 1],
                                                                '</w>')
                    bigram = ('<w>', word[i - 1])
                elif (i == len(word)):
                    trigram = (word[i - 2], word[i - 1], '</w>')
                    bigram = (word[i - 2], word[i - 1])
                elif (i == len(word) + 1):
                    trigram = (word[i - 2], '</w>', '</w>')
                    bigram = (word[i - 2], '</w>')
                else:
                    trigram = (word[i - 2], word[i - 1], word[i])
                    bigram = (word[i - 2], word[i - 1])
                prob = self.get_freq(trigram) / self.get_freq(bigram)
                word_log_prob += np.log(prob)

        elif (self.n == 4):
            for i in range(len(word) + 3):
                if (i == 0):
                    fourgram = ('<w>', '<w>', '<w>', word[i])
                    trigram = ('<w>', '<w>', '<w>')
                elif (i == 1):
                    fourgram = ('<w>', '<w>', word[i - 1],
                                word[i]) if len(word) >= 2 else ('<w>', '<w>',
                                                                 word[i - 1],
                                                                 '</w>')
                    trigram = ('<w>', '<w>', word[i - 1])
                elif (i == 2):
                    if len(word) == 1:
                        fourgram = ('<w>', word[i - 2], '</w>', '</w>')
                        trigram = ('<w>', word[i - 2], '</w>')
                    elif len(word) == 2:
                        fourgram = ('<w>', word[i - 2], word[i - 1], '</w>')
                        trigram = ('<w>', word[i - 2], word[i - 1])
                    else:
                        fourgram = ('<w>', word[i - 2], word[i - 1], word[i])
                        trigram = ('<w>', word[i - 2], word[i - 1])
                elif (i == len(word)):
                    fourgram = (word[i - 3], word[i - 2], word[i - 1], '</w>')
                    trigram = (word[i - 3], word[i - 2], word[i - 1])
                elif (i == len(word) + 1):
                    fourgram = (word[i - 3], word[i - 2], '</w>', '</w>')
                    trigram = (word[i - 3], word[i - 2], '</w>')
                elif (i == len(word) + 2):
                    fourgram = (word[i - 3], '</w>', '</w>', '</w>')
                    trigram = (word[i - 3], '</w>', '</w>')
                else:
                    fourgram = (word[i - 3], word[i - 2], word[i - 1], word[i])
                    trigram = (word[i - 3], word[i - 2], word[i - 1])
                prob = self.get_freq(fourgram) / self.get_freq(trigram)
                word_log_prob += np.log(prob)

        elif (self.n == 5):
            for i in range(len(word) + 4):
                if (i == 0):
                    fivegram = ('<w>', '<w>', '<w>', '<w>', word[i])
                    fourgram = ('<w>', '<w>', '<w>', '<w>')
                elif (i == 1):
                    fivegram = ('<w>', '<w>', '<w>', word[i - 1],
                                word[i]) if len(word) >= 2 else ('<w>', '<w>',
                                                                 '<w>',
                                                                 word[i - 1],
                                                                 '</w>')
                    fourgram = ('<w>', '<w>', '<w>', word[i - 1])
                elif (i == 2):
                    if len(word) == 1:
                        fivegram = ('<w>', '<w>', word[i - 2], '</w>', '</w>')
                        fourgram = ('<w>', '<w>', word[i - 2], '</w>')
                    elif len(word) == 2:
                        fivegram = ('<w>', '<w>', word[i - 2], word[i - 1],
                                    '</w>')
                        fourgram = ('<w>', '<w>', word[i - 2], word[i - 1])
                    else:
                        fivegram = ('<w>', '<w>', word[i - 2], word[i - 1],
                                    word[i])
                        fourgram = ('<w>', '<w>', word[i - 2], word[i - 1])
                elif (i == 3):
                    if len(word) == 1:
                        fivegram = ('<w>', word[i - 3], '</w>', '</w>', '</w>')
                        fourgram = ('<w>', word[i - 3], '</w>', '</w>')
                    elif len(word) == 2:
                        fivegram = ('<w>', word[i - 3], word[i - 2], '</w>',
                                    '</w>')
                        fourgram = ('<w>', word[i - 3], word[i - 2], '</w>')
                    elif len(word) == 3:
                        fivegram = ('<w>', word[i - 3], word[i - 2],
                                    word[i - 1], '</w>')
                        fourgram = ('<w>', word[i - 3], word[i - 2],
                                    word[i - 1])
                    else:
                        fivegram = ('<w>', word[i - 3], word[i - 2],
                                    word[i - 1], word[i])
                        fourgram = ('<w>', word[i - 3], word[i - 2],
                                    word[i - 1])
                elif (i == len(word)):
                    fivegram = (word[i - 4], word[i - 3], word[i - 2],
                                word[i - 1], '</w>')
                    fourgram = (word[i - 4], word[i - 3], word[i - 2],
                                word[i - 1])
                elif (i == len(word) + 1):
                    fivegram = (word[i - 4], word[i - 3], word[i - 2], '</w>',
                                '</w>')
                    fourgram = (word[i - 4], word[i - 3], word[i - 2], '</w>')
                elif (i == len(word) + 2):
                    fivegram = (word[i - 4], word[i - 3], '</w>', '</w>',
                                '</w>')
                    fourgram = (word[i - 4], word[i - 3], '</w>', '</w>')
                elif (i == len(word) + 3):
                    fivegram = (word[i - 4], '</w>', '</w>', '</w>', '</w>')
                    fourgram = (word[i - 4], '</w>', '</w>', '</w>')
                else:
                    fivegram = (word[i - 4], word[i - 3], word[i - 2],
                                word[i - 1], word[i])
                    fourgram = (word[i - 4], word[i - 3], word[i - 2],
                                word[i - 1])
                prob = self.get_freq(fivegram) / self.get_freq(fourgram)
                word_log_prob += np.log(prob)

        elif (self.n == 6):
            for i in range(len(word) + 5):
                if (i == 0):
                    sixgram = ('<w>', '<w>', '<w>', '<w>', '<w>', word[i])
                    fivegram = ('<w>', '<w>', '<w>', '<w>', '<w>')
                elif (i == 1):
                    sixgram = ('<w>', '<w>', '<w>', '<w>', word[i - 1],
                               word[i]) if len(word) >= 2 else ('<w>', '<w>',
                                                                '<w>', '<w>',
                                                                word[i - 1],
                                                                '</w>')
                    fivegram = ('<w>', '<w>', '<w>', '<w>', word[i - 1])
                elif (i == 2):
                    if len(word) == 1:
                        sixgram = ('<w>', '<w>', '<w>', word[i - 2], '</w>',
                                   '</w>')
                        fivegram = ('<w>', '<w>', '<w>', word[i - 2], '</w>')
                    elif len(word) == 2:
                        sixgram = ('<w>', '<w>', '<w>', word[i - 2],
                                   word[i - 1], '</w>')
                        fivegram = ('<w>', '<w>', '<w>', word[i - 2],
                                    word[i - 1])
                    else:
                        sixgram = ('<w>', '<w>', '<w>', word[i - 2],
                                   word[i - 1], word[i])
                        fivegram = ('<w>', '<w>', '<w>', word[i - 2],
                                    word[i - 1])
                elif (i == 3):
                    if len(word) == 1:
                        sixgram = ('<w>', '<w>', word[i - 3], '</w>', '</w>',
                                   '</w>')
                        fivegram = ('<w>', '<w>', word[i - 3], '</w>', '</w>')
                    elif len(word) == 2:
                        sixgram = ('<w>', '<w>', word[i - 3], word[i - 2],
                                   '</w>', '</w>')
                        fivegram = ('<w>', '<w>', word[i - 3], word[i - 2],
                                    '</w>')
                    elif len(word) == 3:
                        sixgram = ('<w>', '<w>', word[i - 3], word[i - 2],
                                   word[i - 1], '</w>')
                        fivegram = ('<w>', '<w>', word[i - 3], word[i - 2],
                                    word[i - 1])
                    else:
                        sixgram = ('<w>', '<w>', word[i - 3], word[i - 2],
                                   word[i - 1], word[i])
                        fivegram = ('<w>', '<w>', word[i - 3], word[i - 2],
                                    word[i - 1])
                elif (i == 4):
                    if len(word) == 1:
                        sixgram = ('<w>', word[i - 4], '</w>', '</w>', '</w>',
                                   '</w>')
                        fivegram = ('<w>', word[i - 4], '</w>', '</w>', '</w>')
                    elif len(word) == 2:
                        sixgram = ('<w>', word[i - 4], word[i - 3], '</w>',
                                   '</w>', '</w>')
                        fivegram = ('<w>', word[i - 4], word[i - 3], '</w>',
                                    '</w>')
                    elif len(word) == 3:
                        sixgram = ('<w>', word[i - 4], word[i - 3],
                                   word[i - 2], '</w>', '</w>')
                        fivegram = ('<w>', word[i - 4], word[i - 3],
                                    word[i - 2], '</w>')
                    elif len(word) == 4:
                        sixgram = ('<w>', word[i - 4], word[i - 3],
                                   word[i - 2], word[i - 1], '</w>')
                        fivegram = ('<w>', word[i - 4], word[i - 3],
                                    word[i - 2], word[i - 1])
                    else:
                        sixgram = ('<w>', word[i - 4], word[i - 3],
                                   word[i - 2], word[i - 1], word[i])
                        fivegram = ('<w>', word[i - 4], word[i - 3],
                                    word[i - 2], word[i - 1])
                elif (i == len(word)):
                    sixgram = (word[i - 5], word[i - 4], word[i - 3],
                               word[i - 2], word[i - 1], '</w>')
                    fivegram = (word[i - 5], word[i - 4], word[i - 3],
                                word[i - 2], word[i - 1])
                elif (i == len(word) + 1):
                    sixgram = (word[i - 5], word[i - 4], word[i - 3],
                               word[i - 2], '</w>', '</w>')
                    fivegram = (word[i - 5], word[i - 4], word[i - 3],
                                word[i - 2], '</w>')
                elif (i == len(word) + 2):
                    sixgram = (word[i - 5], word[i - 4], word[i - 3], '</w>',
                               '</w>', '</w>')
                    fivegram = (word[i - 5], word[i - 4], word[i - 3], '</w>',
                                '</w>')
                elif (i == len(word) + 3):
                    sixgram = (word[i - 5], word[i - 4], '</w>', '</w>',
                               '</w>', '</w>')
                    fivegram = (word[i - 5], word[i - 4], '</w>', '</w>',
                                '</w>')
                elif (i == len(word) + 4):
                    sixgram = (word[i - 5], '</w>', '</w>', '</w>', '</w>',
                               '</w>')
                    fivegram = (word[i - 5], '</w>', '</w>', '</w>', '</w>')
                else:
                    sixgram = (word[i - 5], word[i - 4], word[i - 3],
                               word[i - 2], word[i - 1], word[i])
                    fivegram = (word[i - 5], word[i - 4], word[i - 3],
                                word[i - 2], word[i - 1])
                prob = self.get_freq(sixgram) / self.get_freq(fivegram)
                word_log_prob += np.log(prob)

        return word_log_prob
Example #21
def doc_features(doc):
    doc_words = FreqDist(w for w in doc if not isStopWord(w))
    features = {}
    for word in word_features:
        features['count (%s)' % word] = (doc_words.get(word, 0))
    return features
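
For a quick check, doc_features can be exercised with hypothetical word_features and isStopWord definitions:

from nltk import FreqDist

word_features = ["market", "price"]                 # hypothetical feature words
isStopWord = lambda w: w in {"the", "a", "of"}      # hypothetical stop-word test
doc = "the price of the market price".split()
doc_words = FreqDist(w for w in doc if not isStopWord(w))
features = {"count (%s)" % w: doc_words.get(w, 0) for w in word_features}
# {'count (market)': 1, 'count (price)': 2}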
Example #22
def doc_features(doc):
    doc_words = FreqDist(w for w in doc if not isStopWord(w))
    features = {}
    for word in word_features:
        features['count (%s)' % word] = (doc_words.get(word, 0))
    return features
def remove_low_freq_phrases(keyPhrases, cutOff):
    """
    Purpose: To remove low frequency key phrases. These low frequency key phrases
            can muddy up the final results. Also, removes duplicate key phrases
            in the SAME comment so that 1 customer saying the SAME 
            THING over and over doesn't skew the results in their favor.
    
    Args:
        - A list of strings, such as key phrases. Each list entry contains key 
        phrases for 1 comment separated by a comma. Note, a key phrase can be 
        1-4 words long which will be taken into account below. 
        
        - Cutoff - an integer representing the frequency. If any key phrases 
        occur below that cut-off they will be removed. 
        
    Returns:
        A list of strings (key phrases) formatted the same as the input.
    
    Raises: Nothing.
    
    Example:
        keyPhrases[0] = 'gift card, restaurant week photo competition, two tables'
        In other words, 1 customer comment had all of the key phrases above. 
        
        cutOff = 2. In this example, "gift card" must appear at least 2 times
        across all the key phrases or else it will be deleted from the returned list.

    """

    freqWords = []
    bagOfphrases = []
    
    for entry in keyPhrases:
    
        tmpList = []    
    
        # Separate the comma separated word list into individual list entries.
        # This can result in key phrases that are 1 to 4 words long.
        strList = entry.split(sep = ',')
        
        # Ensure each key phrase is NOT repeated in the SAME comment. 
        # Don't want 1 comment to have the same key phrase multiple times 
        # skewing the frequency. Also, don't want to include single characters
        # for consideration. 
        for strEntry in strList:
            if strEntry not in tmpList and len(strEntry) > 1:
                bagOfphrases.append(strEntry)
                tmpList.append(strEntry)

    # Use FreqDist to count the number of times each phrase occurs. 
    fDist = FreqDist(bagOfphrases)
    
    # Loop through each entry in the keyPhrases. Get the frequency for that
    # key phrase. If it's above the cut-off then keep it, if not
    # remove it.
    for entry in keyPhrases:
    
        tmpList = []
        results = ""

        # Separate the comma separated word list into individual list entries.
        # This can result in key phrases that are 1 to 4 words long.
        strList = entry.split(sep = ',')
        
        # For each key phrase if it's equal to or above the frequency cut off 
        # then save it.
        for strEntry in strList: 
            if strEntry is not None:

                # Need to get the frequency first since a get() will return None
                # if the .get(key) is not found. Also ensure we don't duplicate 
                # key phrases for the same comment. 
                frequency = fDist.get(strEntry)
                if frequency is not None:
                    if strEntry not in tmpList and frequency >= cutOff:
                        results = results + ', ' + strEntry
                        tmpList.append(strEntry)
        
        # Remove the leading comma.
        if len(results) > 2:
            results = results[2:]
            results = results.strip()

        freqWords.append(results)
    
    return freqWords
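
A small usage example matching the docstring; the inputs are illustrative:

keyPhrases = ["gift card, two tables", "gift card, live music", "gift card"]
print(remove_low_freq_phrases(keyPhrases, cutOff=2))
# ['gift card', 'gift card', 'gift card'] -- only phrases seen in at least 2 comments survive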
Example #24
# print the probability of each word
wordPos = [(x, dataRaw_pdist.prob(x)) for x in vocabRaw_tokens_nopunct]

# print the probability of UNK
KPos = 0
for y in vocabRaw_tokens_nopunct:
    KPos += dataRaw_pdist.prob(y)
UNKPos = [('UNK', (1 - KPos))]

wordPos.append(UNKPos[0])
#print(wordPos)
#print('UNK, ',UNKPos)

# after smoothing
_Pa = (dataRaw_fdist.get('a') + 1) / (len(dataRaw_tokens_nopunct) +
                                      len(vocabRaw_tokens_nopunct) + 1)
_Pb = (dataRaw_fdist.get('b') + 1) / (len(dataRaw_tokens_nopunct) +
                                      len(vocabRaw_tokens_nopunct) + 1)
_Pc = (dataRaw_fdist.get('c') + 1) / (len(dataRaw_tokens_nopunct) +
                                      len(vocabRaw_tokens_nopunct) + 1)
_Punk = 1 - _Pa - _Pb - _Pc
_Pa, _Pb, _Pc, _Punk

# calculate the bi_gram
s1 = '<s>'
s2 = '</s>'
Know = vocabRaw_tokens_nopunct[0] + vocabRaw_tokens_nopunct[
    1] + vocabRaw_tokens_nopunct[2]
vocabRaw_tokens_nopunct.append("[^" + Know + "]")
vocabRaw_tokens_nopunct.append(s1)
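
FreqDist also exposes relative frequencies directly via freq(), which mirrors the prob() calls above; a tiny sketch on toy tokens:

from nltk import FreqDist

fd = FreqDist("a b a c a b".split())
print(fd.freq("a"))                 # 0.5 (MLE probability)
print(fd.get("d"), fd.freq("d"))    # None 0.0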
Example #25
class Text():
    '''
    >>> source = os.path.abspath(r"..\CORPUS\en")
    >>> sents = Text(Stream(Path(source,"*.txt")))
    >>> for sent in sents:
    ...     print(sent)
    '''
    def __init__(
            self,
            intake,
            prep: Prep = None,
            clean: TextCleaner = None,
            filters: TokenFilter = None,
            inplace=False,
            datadir=None,
            encoding=chardetector,
            verbose=True,
            rewrite=False,
            loadas="pickle",
            saveas=("txt", "pickle"),
            input='filename'  # str {'filename', 'file', 'text'}
    ):

        self._path = ''
        self.filename = ''
        self.name = ''
        self.inplace = inplace
        self.verbose = verbose
        self.rewrite = rewrite
        self.loadas = loadas
        self.saveas = saveas
        self.encoding = 'unknown'

        if not datadir:
            self.datadir = os.path.join(
                os.path.abspath(os.path.dirname(__file__)), "data")
        else:
            self.datadir = os.path.abspath(datadir)

        self._encoding = None
        self._nwords = 0
        self._sents = []
        self._vocab = FreqDist()
        self._trie = dawg.RecordDAWG(">IH")

        if input == 'filename':
            self._path = intake
            #self.filename = os.path.basename(os.path.splitext(self._path)[0])
            self.filename = os.path.basename(self._path)
            self.name = os.path.splitext(self.filename)[0]
            self._encoding = encoding
            self._input = intake

            if not self.rewrite:
                if self.loadas == 'pickle':
                    # all sentences from the text
                    self._sents = self.loadpickle('sents') or []
                    # all unique normalized words from the text
                    self._vocab = self.loadpickle('vocab') or FreqDist()
                    # Rebuilding the vocabulary iteratively takes several seconds,
                    # so it is faster (a fraction of a second) to read it from pickle:
                    #for sent in self._sents:
                    #    self._vocab += FreqDist(sent.lemmas())

                    # prefix tree
                    self._trie = self.loaddawg('trie') or dawg.RecordDAWG(">IH")

        elif input == "text":
            self._input = io.StringIO(intake)
            self._path = ''
            self.filename = self._input.__class__.__name__
            self.name = self.filename

        elif input == "file":
            self._input = intake
            self._path = ''
            self.filename = self._input.__class__.__name__
            self.name = self.filename

        if self._sents:
            self._nwords = sum(map(lambda s: s.nwords, self._sents))

        self._prep = prep
        self._clean = clean
        self._filters = filters

        self._iter = self.__sents__()
        # close the generator if data is loaded
        if self._sents:
            self._iter.close()

        if self.inplace:
            if not self._sents:
                list(self._iter)

    def __sents__(self):

        encoding = self._encoding
        sentencizer = self._prep.sentencizer
        clean = self._clean
        path = self._input

        if self.loadas == 'txt' and self._path:
            path = datapath(self._path,
                            datadir=self.datadir,
                            ext=".tagged.txt").full
            if os.path.exists(path):
                encoding = 'utf-8'
                sentencizer = None
                clean = None

        stream = Stream(path, encoding=encoding)
        self.encoding = stream._encoding
        for num, sent in enumerate(stream(sentencizer, clean)):
            tagged_sent = TaggedSentence(sent.strip(), num, self._prep,
                                         self._filters)
            lemmas = tagged_sent.lemmas()
            # every lemma ends up in this vocabulary,
            # since nothing is filtered at this point
            self._vocab += FreqDist(lemmas)
            self._nwords += tagged_sent.nwords
            self._sents.append(tagged_sent)
            #self._words.extend(tagged_sent.words())

            yield tagged_sent

        data = ((token.word, (token.nsent, token.idx))
                for sent in self.sents() for token in sent.tokens(lower=True))
        self._trie = dawg.RecordDAWG(">IH", data)

    @property
    def embed(self):
        '''Access to Word2Vec for building embeddings'''
        return word2vec()

    @property
    def trie(self):
        '''Access to the text's prefix tree'''
        return Trie(self._trie)

    """
    def trie(self, key=None, sort=True):
        '''Access to the text's prefix tree'''
        if key is None:return self._trie
        
        res = self._trie.get(key,[])
        if sort:
            res.sort(key=lambda t:(t[0],[1]))
        return res 
            
    def startswith(self, affix):
        '''Search the prefix tree for words
        that start with the given prefix'''
        return self._trie.keys(affix)
    
    @property
    def occur(self):
        return self._trie
    """

    @property
    def vocab(self):
        '''Access to the text's lemma vocabulary'''
        return self._vocab

    @property
    def nsents(self):
        '''Number of sentences in the text'''
        return len(self._sents)

    @property
    def nwords(self):
        '''Number of words in the text'''
        return self._nwords

    @property
    def nlemmas(self):
        '''Number of lemmas in the text'''
        return len(self._vocab)

    def _iter_by_sents(self, attr, filtrate=False, lower=False, pos=None):
        for sent in self.sents():
            tokens = sent.tokens(filtrate=filtrate, lower=lower, pos=pos)
            values = []
            for token in tokens:
                values.append(getattr(token, attr))
            yield values

    def sents2words(self, filtrate=False, lower=False, pos=None):
        result = [
            sent for sent in self._iter_by_sents(
                attr="word", filtrate=filtrate, lower=lower, pos=pos)
        ]
        return result

    def sents2lemmas(self, filtrate=False, lower=False, pos=None):
        result = [
            sent for sent in self._iter_by_sents(
                attr="lemma", filtrate=filtrate, lower=lower, pos=pos)
        ]
        return result

    def sents(self, n_sent=None, max_words=None, min_words=None):
        if n_sent is not None:
            res = self._sents[n_sent]
        else:
            if min_words is not None and max_words is not None:
                res = [
                    sent for sent in self._sents
                    if min_words <= sent.nwords <= max_words
                ]
            else:
                if max_words is not None:
                    res = [
                        sent for sent in self._sents
                        if sent.nwords <= max_words
                    ]
                elif min_words is not None:
                    res = [
                        sent for sent in self._sents
                        if sent.nwords >= min_words
                    ]

                else:
                    res = self._sents
        return res
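    # Usage sketch (the `text` instance name is hypothetical):
    #   text.sents()                           # all sentences
    #   text.sents(3)                          # the fourth sentence
    #   text.sents(min_words=5, max_words=40)  # only mid-length sentences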

    def iterwords(self, filtrate=False, lower=True, pos=None):

        for sent in self._sents:
            tokens = sent.tokens(filtrate=filtrate, lower=lower, pos=pos)
            for token in tokens:
                yield token.word

    def iterlemmas(self, filtrate=False, lower=True, pos=None):

        for sent in self._sents:
            tokens = sent.tokens(filtrate=filtrate, lower=lower, pos=pos)
            for token in tokens:
                yield token.lemma

    # TODO: rework to extract directly from the trie
    def words(self, filtrate=False, lower=True, pos=None, uniq=False):
        result = [
            word
            for word in self.iterwords(filtrate=filtrate, lower=lower, pos=pos)
        ]

        if uniq:
            result = list(set(result))

        return result

    # TODO: rework to extract directly from the trie
    def lemmas(self, filtrate=False, lower=True, pos=None, uniq=False):
        result = [
            lemma for lemma in self.iterlemmas(
                filtrate=filtrate, lower=lower, pos=pos)
        ]

        if uniq:
            result = list(set(result))

        return result

    def tags(self):
        result = []
        for sent in self._sents:
            result.extend(sent.tags)

        return result

    def postags(self,
                pos=None,
                sort=False,
                top=0,
                universal_tagset=False,
                ret_cond=False):
        '''Builds frequency dictionaries or frequency-sorted lists
        of parts of speech'''
        def merge(tags):
            result = FreqDist()
            for tag in tags:
                result += cfd[tag]
            return result

        maps = {
            'NOUN': {'NN', 'NNS', 'NNP', 'NNPS'},
            'VERB': {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'},
            'ADJ': {'JJ', 'JJR', 'JJS'},
            'ADV': {'RB', 'RBR', 'RBS'},
        }

        cfd = ConditionalFreqDist()

        for sent in self._sents:
            #tokens = sent.untagging()
            tokens = sent.tags
            for tok, tag, lemma in tokens:
                cfd[tag][lemma.lower()] += 1
        cond = cfd.conditions()

        result = cfd

        if pos:
            if not universal_tagset and pos in maps:
                result = merge(maps[pos])
            else:
                result = cfd[pos]

        if top:
            result = _top(result, top)
        else:
            result = _sort(result, sort)

        if ret_cond:
            result = result, cond

        return result
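    # Usage sketch (the `text` instance name is hypothetical):
    #   text.postags()                    # ConditionalFreqDist keyed by Penn tag
    #   text.postags(pos='NOUN', top=10)  # merges NN/NNS/NNP/NNPS, top 10 lemmas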

    '''    
    def postags2(self, pos=None):
        words = []
        for sent in self._sents:
            words.extend(sent.untagging())    
        words.sort()
        tags = defaultdict(list)
        for key, group in groupby(words, lambda make: make[1]):
            tags[key].extend([l for t,p,l in group])
        
        if pos:
           return tags.get(pos)
        return tags
    '''

    def suffix(self, affix):
        '''Search over the suffix tree'''
        pass

    def stats(self):
        '''Assorted statistics about the text'''
        pass

    def count(self, token=None, words=True, uniq=False, lower=True):

        if words:
            if token:
                # total number of occurrences of the word form in the text
                result = len(self._trie.get(token, []))
                return result
            #-----------------------------------
            # number of unique word forms
            if uniq:
                # here we first have to collect all the words and only then
                # count how many unique ones there are, since this
                # information is not stored anywhere
                result = len(self.words(uniq=True, lower=lower))
            # total number of occurrences of all word forms
            else:
                result = self.nwords
        #--------------------------------
        else:
            # by lemmas
            if token:
                # total number of occurrences of the lemma in the text
                result = self._vocab.get(token, 0)
                return result
            # number of unique lemmas
            if uniq:
                result = len(self._vocab)
            # total number of occurrences of all lemmas
            else:
                result = sum(self._vocab.values())

        return result
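    # Usage sketch (the `text` instance name is hypothetical):
    #   text.count('dog')                   # occurrences of the word form
    #   text.count('dog', words=False)      # occurrences of the lemma
    #   text.count(uniq=True)               # number of distinct word forms
    #   text.count(words=False, uniq=True)  # number of distinct lemmas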

    def keywords(
        self,
        by='words',
        rating=('rake',
                dict(max_words=4,
                     stopwords=nltk.corpus.stopwords.words('english')))):

        sents = []

        for sent in self._sents:
            tokens = sent.words() if by == 'words' else sent.lemmas()
            sents.append(tokens)

        result = None

        if rating[0] == 'rake':
            rake = Rake(sents, **rating[1])
            result = rake

        elif rating[0] == 'textrank':
            # not implemented: the TextRank class used here
            # produces scores only for whole sentences
            pass

        return result
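    # Usage sketch (the `text` instance name is hypothetical): keywords()
    # builds a Rake scorer over the sentences' word (or lemma) lists; the
    # rating tuple lets you tune it, e.g.
    #   rating=('rake', dict(max_words=3, stopwords=my_stopwords))
    # where `my_stopwords` is an assumed user-supplied list.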

    # building the graph in TextRank uses a lot of memory for large texts
    # (> 20 thousand word tokens)!
    def summarize(self, top=7, scores=True):

        words = [set(sent.lemmas(uniq=True)) for sent in self.sents()]
        textrank = TextRank(words, self.nsents)

        if top:
            result = textrank.topn(n=top)
            if scores:
                result = [(score, self._sents[idx].raw)
                          for idx, score in result]
            else:
                result = [self._sents[idx].raw for idx, score in result]

        else:
            result = textrank

        return result
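    # Usage sketch (the `text` instance name is hypothetical):
    #   text.summarize(top=5)                # [(score, raw_sentence), ...]
    #   text.summarize(top=5, scores=False)  # raw sentences only
    #   text.summarize(top=0)                # the underlying TextRank object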

    def doc2bow(self):
        pass

    def ngrams(self, n, words=False, filtrate=False, lower=True, **kwargs):
        method = self.words if words else self.lemmas
        yield from nltk.ngrams(method(filtrate=filtrate, lower=lower), n,
                               **kwargs)

    def skipgrams(self,
                  n,
                  k,
                  words=False,
                  filtrate=False,
                  lower=True,
                  **kwargs):
        method = self.words if words else self.lemmas
        yield from nltk.skipgrams(method(filtrate=filtrate, lower=lower), n, k,
                                  **kwargs)
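    # Usage sketch (the `text` instance name is hypothetical):
    #   list(text.ngrams(2))              # lemma bigrams
    #   list(text.ngrams(3, words=True))  # word trigrams
    #   list(text.skipgrams(2, 2))        # lemma bigrams allowing up to 2 skips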

    def collocations(self):
        pass

    def hapaxes(self, words=False, filtrate=False):
        '''Extracts hapax legomena (words that occur only once) from the text'''
        if not words:
            # search among lemmas
            res = self._vocab
        else:
            res = FreqDist(self.words(filtrate=filtrate))
        return res.hapaxes()
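    # Usage sketch (the `text` instance name is hypothetical):
    #   text.hapaxes()                           # lemmas occurring exactly once
    #   text.hapaxes(words=True, filtrate=True)  # same over filtered word forms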

    @property
    def speller(self):
        return self._prep.speller()

    def _validpath(self, path):
        return os.path.exists(path)

    def loadpickle(self, name, path=None):
        path_ = path or datapath(self._path, datadir=self.datadir).short
        path = '{}.{}.pickle'.format(path_, name)

        if self._validpath(path):
            if self.verbose:
                print('loading pickle:'.ljust(16),
                      path.replace(nlptk.MODULEDIR, '..'))

            with open(path, 'rb') as f:
                obj = pickle.load(f)
        else:
            obj = None
        return obj

    def loaddawg(self, name, path=None):
        path_ = path or datapath(self._path, datadir=self.datadir).short
        path = '{}.{}.dawg'.format(path_, name)

        if self._validpath(path):
            if self.verbose:
                print('loading dawg:'.ljust(16),
                      path.replace(nlptk.MODULEDIR, '..'))

            d = dawg.RecordDAWG(">IH")
            obj = d.load(path)
        else:
            obj = None
        return obj

    def savedawg(self, name, path=None):
        path_ = path or datapath(self._path, datadir=self.datadir).short
        # save the dictionary for the prefix tree
        path = '{}.{}.dawg'.format(path_, name)
        if self.verbose:
            print('saving dawg:'.ljust(16),
                  path.replace(nlptk.MODULEDIR, '..'))
        self._trie.save(path)

    def save(self, path=None, as_=("txt", "pickle")):
        if not os.path.exists(self.datadir):
            os.mkdir(self.datadir)

        path_ = path or datapath(self._path, datadir=self.datadir).short

        saveas = self.saveas or as_
        if not isinstance(saveas, (tuple, list)):
            saveas = (saveas, )

        for fmt in saveas:
            if fmt == "txt":
                path = '{}.tagged.txt'.format(path_)

                if self.verbose:
                    print('saving txt:'.ljust(16),
                          path.replace(nlptk.MODULEDIR, '..'))

                with open(path, 'w', encoding='utf8') as f:
                    f.write('\n'.join(map(str, self._sents)))

            elif fmt == 'pickle':
                path = '{}.sents.pickle'.format(path_)

                if self.verbose:
                    print('saving pickle:'.ljust(16),
                          path.replace(nlptk.MODULEDIR, '..'))

                with open(path, 'wb') as f:
                    pickle.dump(self._sents, f)
                path = '{}.vocab.pickle'.format(path_)

                with open(path, 'wb') as f:
                    pickle.dump(self._vocab, f)

        self.savedawg('trie', path_)

    def __iter__(self):
        return self._iter

    def __next__(self):
        return next(self._iter)

    def __str__(self):
        return '\n'.join([str(sent) for sent in self.sents()])

    def __repr__(self):
        fmt = ("Text(\n\tname='{}',\n\tencoding='{}',\n\t"
               "nsents={},\n\tnwords={},\n\tnlemmas={}\n)")
        return fmt.format(self.name, self.encoding, self.nsents, self.nwords,
                          self.nlemmas)
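    # End-to-end sketch (the constructor arguments are assumptions; the real
    # signature is defined in the class initializer, which is not shown here):
    #   text = Text(...)   # build from an input file / preprocessing pipeline
    #   print(repr(text))  # name, encoding, nsents, nwords, nlemmas
    #   text.save()        # writes .tagged.txt, .sents.pickle, .vocab.pickle
    #                      # and the .trie.dawg files into the data directory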