def __init__(self, text):
    self.raw_text = text
    self.raw_text_length = len(text)
    # counts alphanumeric characters (letters and digits)
    self.number_of_letters = len(
        [x for x in self.raw_text if x.isalpha() or x.isdigit()])
    self.words = StylometryExtractor.TOKENIZER.tokenize(self.raw_text)
    self.tokens = word_tokenize(self.raw_text)
    self.number_of_words = len(self.words)
    self.number_of_tokens = len(self.tokens)
    # self.text = Text(word_tokenize(self.raw_text))
    self.words_frequency = FreqDist(Text(self.words))
    self.tokens_frequency = FreqDist(Text(self.tokens))
    self.chars_counter = FreqDist(self.raw_text)
    self.lemmatizer = WordNetLemmatizer()
    self.lemmatized_words_frequency = FreqDist(
        Text([self.lemmatizer.lemmatize(word) for word in self.words]))
    self.sentences = sent_tokenize(self.raw_text)
    self.number_of_sentences = len(self.sentences)
    self.sentence_chars = [len(sent) for sent in self.sentences]
    self.sentence_word_length = [
        len(sent.split()) for sent in self.sentences
    ]
    self.paragraphs = [
        p for p in self.raw_text.split("\n\n")
        if len(p) > 0 and not p.isspace()
    ]
    self.paragraph_word_length = [len(p.split()) for p in self.paragraphs]
    self.all_trigrams = self._all_trigrams()
    self.all_fourgrams = self._all_fourgrams()
    self.ngram_string = self._to_ngram_string()
    self.features = self._to_dict()
    self.feature_names = list(self.features.keys())
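The constructor above is mostly bookkeeping over a few NLTK primitives. A minimal stand-alone sketch of the same frequency counting (the sample string is invented, and NLTK's punkt data is assumed to be downloaded):

from nltk import FreqDist, Text, sent_tokenize, word_tokenize

sample = "The quick brown fox jumps over the lazy dog. The dog sleeps."
words = word_tokenize(sample)
sentences = sent_tokenize(sample)

words_frequency = FreqDist(Text(words))        # token -> count
sentence_word_length = [len(s.split()) for s in sentences]

print(words_frequency.most_common(3))
print(sentence_word_length)                    # [9, 3]
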
Example 2
    def __init__(self, file_content, author=DEFAULT_AUTHOR):
        self.author = author.strip()
        self.raw_content = file_content
        self.file_content = file_content.lower()
        self.tokens = PortugueseTextualProcessing.tokenize(self.file_content)
        self.text = Text(self.tokens)
        self.fdist = FreqDist(self.text)
        self.sentences = sent_tokenize(self.file_content, language='portuguese')
        self.sentence_chars = [len(sent) for sent in self.sentences]
        self.sentence_word_length = [len(sent.split()) for sent in self.sentences]
        self.paragraphs = [p for p in self.file_content.split("\n\n") if len(p) > 0 and not p.isspace()]
        self.paragraph_word_length = [len(p.split()) for p in self.paragraphs]
        self.punctuation = [".", ",", ";", "-", ":"]
        self.ner_entities = ['ABSTRACCAO', 'ACONTECIMENTO', 'COISA', 'LOCAL',
                             'ORGANIZACAO', 'OBRA', 'OUTRO', 'PESSOA', 'TEMPO', 'VALOR']
        self.white_spaces = len(self.file_content.split(' '))

        self.rich_tags = RichTags(PortugueseTextualProcessing.get_rich_tags(self.file_content), len(self.text))
        self.tagged_sentences = PortugueseTextualProcessing.postag(self.tokens)
        self.tagfdist = FreqDist([b for [(a, b)] in self.tagged_sentences])
        self.ner_tags = PortugueseTextualProcessing.ner_chunks(self.tokens)
        self.ner_ftags = FreqDist(self.ner_tags)
        self.spell = SpellChecker(language='pt')
        self.ROUNDING_FACTOR = 4
        self.LINE_BREAKS = ['\n', '\t', '\r']
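sent_tokenize in the snippet above uses NLTK's pretrained Punkt model for Portuguese. A quick stand-alone check (the sample text is invented):

from nltk import sent_tokenize

texto = "O gato dorme no sofá. O cachorro corre no parque."
print(sent_tokenize(texto, language='portuguese'))
# ['O gato dorme no sofá.', 'O cachorro corre no parque.']
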
Example 3
def concord(nome, arquivo):
    """
    Build a keyword-in-context concordance from an uploaded file and write the results to a text file.
    """
    # Input
    nome_p = arquivo
    nome_e = str(nome)
    # print(nome_e, nome_p)  # for debugging

    # Open the file to be read
    with open("/tmp/concordancia/{arquivo}".format(arquivo=nome_e)) as f:
        arquivo_e = f.read()

    # Tokenize the file
    token = word_tokenize(arquivo_e)

    texto = Text(token)

    # texto.concordance(nome_p)

    # Open the output file
    arquivo_s = open("/tmp/concordancia/saida.txt", "w")

    saida = concordance_2_txt(nome_p, token)

    for x in saida:
        arquivo_s.write("%s\n" % x)

    arquivo_s.close()

    # Remove the uploaded file
    file_remove("concordancia", nome_e)
Example 4
    def meaning_analysis(self, sentence_analysis):
        final_str = ''
        tokens = word_tokenize(sentence_analysis)
        text = Text(tokens)
        tags = pos_tag(text)
        print(tags)
        nouns = "NN NNP PRP NNS".split()
        verbs = "VB VBD VBP VBG".split()
        questions_nouns = "WP WRB".split()
        for i in range(len(tags)):
            if tags[i][1] in verbs:
                # Look for a noun among the next four tags, guarding against
                # running past the end of the tag list.
                if any(tags[j][1] in nouns
                       for j in range(i + 1, min(i + 5, len(tags)))):
                    print("its a decision")
                else:
                    print("unknown decision")
                final_str = "decision"
                break
            elif tags[i][1] in questions_nouns:
                final_str = "question"
                break
            else:
                final_str = "question asked"
        return final_str
def tokenizer():
    fileids = cicero.abspaths()
    reader = CategorizedXMLCorpusReader('/',
                                        fileids,
                                        cat_file='categories.txt')
    tokens = Text(reader.words(fileids))
    return tokens
 def get_count(self, obj: SearchResult) -> int:
     sentences = self.get_sentences(obj)
     tokens = [word_tokenize(sentence) for sentence in sentences]
     flat_tokens = list(itertools.chain(*tokens))
     lower_case_tokens = [token.lower() for token in flat_tokens]
     text = Text(lower_case_tokens)
     return text.count(obj.search_term.lower())
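Text.count matches tokens exactly, which is why both the tokens and the search term are lower-cased first. A minimal sketch of the same idea (the sentence and search term are invented):

from nltk import Text, word_tokenize

tokens = [t.lower() for t in word_tokenize("Python is fun. I really like Python.")]
text = Text(tokens)
print(text.count("python"))  # 2
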
Example 7
 def save_most_popular_words_analysis(self, most_common_quantity):
     articles_tokens = list()
     for (codex_type, _) in tqdm(self.parser.codex_urls):
         raw_articles_info = self.parser.sorted_articles_info[codex_type]
         for article_info in tqdm(raw_articles_info):
             text = self.parser.get_article_text_by_id(article_info.id)
             text = text.lower()
             text = self.remove_chars_from_text(text, self.spec_chars)
             article_tokens = word_tokenize(' '.join(
                 self.mystem.lemmatize(text)))
             for stop_word in self.stop_words:
                 while stop_word in article_tokens:
                     article_tokens.remove(stop_word)
             articles_tokens.extend(article_tokens)
     text = Text(articles_tokens)
     f_dist = FreqDist(text)
     if os.path.exists(self.config['most_popular_words_analysis_file']):
         os.remove(self.config['most_popular_words_analysis_file'])
     with open(self.config['most_popular_words_analysis_file'],
               mode='w') as most_popular_words_analysis_file:
         most_popular_words_analysis_writer = csv.writer(
             most_popular_words_analysis_file,
             delimiter=',',
             quotechar='"',
             quoting=csv.QUOTE_MINIMAL)
         most_popular_words_analysis_writer.writerow(
             ['word', 'word_count', 'frequency'])
         for info in f_dist.most_common(most_common_quantity):
             most_popular_words_analysis_writer.writerow(
                 [info[0], info[1], info[1] / len(articles_tokens)])
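Stripped of the scraping and lemmatization details, the heart of the routine above is FreqDist.most_common written out as CSV rows. A trimmed-down sketch (the tokens and file name are invented):

import csv

from nltk import FreqDist

tokens = ["суд", "закон", "суд", "статья", "закон", "суд"]
f_dist = FreqDist(tokens)
with open("most_popular_words.csv", mode="w", newline="") as fh:
    writer = csv.writer(fh, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["word", "word_count", "frequency"])
    for word, count in f_dist.most_common(2):
        writer.writerow([word, count, count / len(tokens)])
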
Example 8
    def read_text_sections(self, filename):
        tokens = []

        file_handle = open(filename, "r")

        for line in file_handle:
            try:

                file_tokens = word_tokenize(line)
                file_tokenized_text = Text(file_tokens)
                stop_words = stopwords.words('english')

                # Clear Stop words in the tokens and special characters.
                for token in file_tokenized_text:
                    lower_str = token.lower()
                    if lower_str not in stop_words and re.match(regex_clear, lower_str) and len(lower_str) > 2\
                            and not(lower_str.isdigit()):
                        tokens.append(lower_str)

            except UnicodeDecodeError:
                print("Unicode Decode Error: Moving On")

            if len(tokens) != 0:
                self.token_list.append(tokens)

        file_handle.close()
Example 9
def SentenceBeginWithConj(sentence):
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    if tags[0][1] == "CC" or tags[0][1] == "IN":
        return True
    return False
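Several of the helpers in these examples follow the same pattern: word_tokenize the sentence, wrap the tokens in Text (pos_tag accepts it because Text is just an iterable of tokens), then inspect the resulting Penn Treebank tags. A quick check of that tag lookup (the sentence is invented):

from nltk import Text, pos_tag, word_tokenize

tags = pos_tag(Text(word_tokenize("But the meeting was cancelled.")))
print(tags[0])  # ('But', 'CC') -- a coordinating conjunction opens the sentence
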
Example 10
    def read_file(self, filename):
        """
        This function reads a file and returns a set of tokens back.
        :param filename: This is name of file to be read.
        """
        tokens = []

        file_handle = open(filename, "r")
        file_text = file_handle.read()

        # file_text contains the whole file.
        # Reading the whole file at once is fine here because individual
        # files are small, even though there are many of them.

        try:
            file_tokens = word_tokenize(file_text)
            file_tokenized_text = Text(file_tokens)
            stop_words = stopwords.words('english')

            # Clear Stop words in the tokens and special characters.
            for token in file_tokenized_text:
                lower_str = token.lower()
                if lower_str not in stop_words and re.match(regex_clear, lower_str) and len(lower_str) > 2\
                        and not(lower_str.isdigit()):
                    tokens.append(lower_str)

        except UnicodeDecodeError:
            print("Unicode Decode Error: Moving On")

        file_handle.close()
        if len(tokens) != 0:
            self.token_list.append(tokens)
Example 11
def get_feats_inds(text):
    t = Text(word_tokenize(text))
    g1s = [(g, True) for g in ngrams(t, 1)]
    g2s = [(g, True) for g in ngrams(t, 2)]
    #3-grams, 4-grams, and so on can also be used
    #g3s = [(g, True) for g in ngrams(t, 3)]
    return dict(g1s + g2s)
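Boolean feature dicts of this shape are the form NLTK's own classifiers expect. A hedged sketch of how they might be wired into a classifier (the training texts and labels are invented for illustration):

from nltk import NaiveBayesClassifier, Text, word_tokenize
from nltk.util import ngrams

def get_feats_inds(text):
    t = Text(word_tokenize(text))
    return {g: True for n in (1, 2) for g in ngrams(t, n)}

train = [(get_feats_inds("a great and memorable movie"), "pos"),
         (get_feats_inds("a dull and forgettable movie"), "neg")]
classifier = NaiveBayesClassifier.train(train)
print(classifier.classify(get_feats_inds("a great movie")))
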
def getNLTKText(carrel):

    # configure
    MODEL = 'reader.nltk'

    # require
    from nltk import Text, word_tokenize
    from os import path, stat
    import pickle

    # initialize
    localLibrary = configuration('localLibrary')
    file = localLibrary / carrel / ETC / MODEL

    # check to see if we've previously been here
    if path.exists(file):

        # read the model
        with open(file, 'rb') as handle:
            model = pickle.load(handle)

    else:

        # create the model and save it for future use
        corpus = localLibrary / carrel / ETC / CORPUS
        model = Text(word_tokenize(open(corpus).read()))
        with open(file, 'wb') as handle:
            pickle.dump(model, handle)

    # return the model
    return (model)
Example 13
def concordance(carrel, query, width, lines):
    """A poor man's search engine.
	
	Given a query, this subcommand will search <carrel> and return a list of results where each result is a set of words to the left of query, the query, and a set of words to the right of query -- a keyword-in-context index. This is useful for answering the question, "What words are used in the same breath as the given word?" The query can be a phrase. Consider creating a word cloud from the output of this command to visualize the "words used in the same breath". 
	
	Examples:
	
	\b
	  rdr concordance homer -q hector
	  rdr concordance homer -q 'hector was'

	See also: rdr ngrams --help"""

    # require
    from nltk import Text, word_tokenize

    # sanity checks
    checkForCarrel(carrel)
    checkForPunkt()

    # initialize, read, and normalize; ought to save the result for future use
    localLibrary = configuration('localLibrary')
    corpus = localLibrary / carrel / ETC / CORPUS
    text = Text(word_tokenize(open(corpus).read()))

    # split query into a list, conditionally
    if ' ' in query: query = query.split(' ')

    # do the work and output
    lines = text.concordance_list(query, width=width, lines=lines)
    for line in lines:
        click.echo(line.line)
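Underneath the CLI plumbing, the call doing the work is Text.concordance_list, which matches case-insensitively and returns ConcordanceLine objects whose .line attribute holds the formatted keyword-in-context row. A minimal sketch against an invented two-sentence corpus:

from nltk import Text, word_tokenize

text = Text(word_tokenize("Hector fought bravely before the walls. Hector was a prince of Troy."))
for hit in text.concordance_list("hector", width=40, lines=5):
    print(hit.line)
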
Example 14
 def __init__(self, corpus, old_theme_words, new_theme_words):
     """Initialize parser"""
     print("Initializing themes.")
     textified_corpus = Text(word.lower() for word in corpus.words())
     print("Finished textifying corpus.")
     self._old_theme = Theme.Theme(textified_corpus, old_theme_words)
     self._new_theme = Theme.Theme(textified_corpus, new_theme_words)
Example 15
def get_feats_counts(text):
    t = Text(word_tokenize(text))
    g1s = [(g, count) for g, count in FreqDist(ngrams(t, 1)).items()]
    g2s = [(g, count) for g, count in FreqDist(ngrams(t, 2)).items()]
    #3-grams, 4-grams, and so on can also be used
    #g3s = [(g, count) for g, count in FreqDist(ngrams(t, 3)).items()]
    return dict(g1s + g2s)
Example 16
def SentenceBeginWithPronoun(sentence):
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    if tags[0][1] == "PRP" or tags[0][1] == "PRP$":
        return True
    return False
Example 17
    def understand_text(self, source):
        output = open(
            "Analytics_for_" + source +
            '_{:%Y_%m_%d_%H%M%S}.txt'.format(datetime.datetime.now()),
            "w", encoding="utf-8")
        main = self.combine_articles_from_source(source)
        puncts = list(string.punctuation)
        article_tokens = word_tokenize(main)
        clean_tokens = []
        stop_words = set(stopwords.words("english"))
        # Remove punctuation and stop words
        for token in article_tokens:
            if token not in puncts and token not in stop_words and token != "'s" and token != "``" and token != "''":
                clean_tokens.append(token)

        print("************ANALYSING************")
        print(main)
        output.write(
            "#########################################################")
        output.write("#Analysis of all cached posts by " + source +
                     "         #")
        output.write(
            "#########################################################")
        output.write(
            "#                   Concatenated text:                  #")
        output.write(
            "#########################################################")
        output.write(main)
        output.write(
            "#########################################################")
        output.write("\n\n")
        print("*********************************")
        output.write(
            "############Detected tokens:#############################\n\n")
        fdist = FreqDist(clean_tokens)
        print("*************STATS:*****************")
        print("Detected words: ")
        words = ""
        for key in fdist.keys():
            words += key + ", "
        print(words)
        output.write(words + "\n")
        output.write(
            "\n\n#######################Top 25 words:#####################\n\n"
        )
        print("\n\n***25 Most common***:")
        for common in fdist.most_common(n=25):
            print("\"" + common[0] + "\" occurrences " + str(common[1]))
            output.write("\"" + common[0] + "\" occurrences " +
                         str(common[1]) + "\n")

        output.write(
            "######################COMPLETE############################")
        output.close()

        text = Text(clean_tokens)

        #    text.plot(25)

        print("************/STATS*****************")
Example 18
def SentenceBeginWithPrep(sentence):
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    if (tags[0][1] == "IN" and
            tags[0][0].lower() not in subordinate_conjunction_set_one_word):
        return True
    return False
Example 19
def fun01():
    """fun01"""
    print(gutenberg.fileids())
    # emma by jane austen
    emma = gutenberg.words('austen-emma.txt')
    # how many words it contains
    print(len(emma))
    # concordance() prints its results directly
    Text(emma).concordance("surprize")
Example 20
def verbCounter(text):

    tokens = word_tokenize(text.lower())
    text = Text(tokens)
    tags = pos_tag(text)

    counts = Counter(tag for word, tag in tags)
    return counts
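A quick usage check of the same tag-counting approach (the sentence is invented; the exact tags depend on NLTK's pretrained tagger):

from collections import Counter

from nltk import Text, pos_tag, word_tokenize

tokens = word_tokenize("the cat sat quietly and the old dog ran away")
counts = Counter(tag for word, tag in pos_tag(Text(tokens)))
print(counts)  # Counter of Penn Treebank tags, e.g. counts['VBD'] is the past-tense verb count
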
Example 21
def getPronounCount(sentence):
    pronoun_number = 0
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    for tag in tags:
        if tag[1] == "PRP" or tag[1] == "PRP$":
            pronoun_number = pronoun_number + 1
    return pronoun_number
Example 22
def generate_text_object(tokens, stopword=0):
    _tokens = []
    if stopword == 1:
        for tweet in tokens:
            _tokens.extend(tweet.get_tweet_tokens())
    else:
        for tweet in tokens:
            _tokens.extend(tweet.get_tweet_tokens())
    return Text(_tokens)
Example 23
def getConjunctionCount(sentence):
    conjunction_number = 0
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    for tag in tags:
        if tag[1] == "CC" or tag[1] == "IN":
            conjunction_number = conjunction_number + 1
    return conjunction_number
Example 24
def get_feats(text):
    tokens = word_tokenize(text)
    t = Text(tokens)
    g1s = ngrams(t, 1)
    g1s_list = [(g, True) for g in g1s]
    g2s = ngrams(t, 2)
    g2s_list = [(g, True) for g in g2s]
    gs = g1s_list + g2s_list
    return dict(gs)
Example 25
def getPrepositionCount(sentence):
    preposition_number = 0
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    for tag in tags:
        if tag[1] == "IN":
            preposition_number = preposition_number + 1
    return preposition_number
Example 26
def hasAuxiliaryVerb(sentence):
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    for index, token in enumerate(tokens):
        # if sentence has "don't", "doesn't", "didn't", it must have an auxiliary verb
        if token in do_not_set:
            return True

        if token in be_set:
            # are done
            if index < len(tokens) - 1 and tags[index + 1][1] == 'VBN':
                return True
            # are not done
            if index < len(tokens) - 2 and tags[index + 2][1] == 'VBN':
                return True
            # are doing
            if index < len(tokens) - 1 and tags[index + 1][1] == 'VBG':
                return True
            # are not doing
            if index < len(tokens) - 2 and tags[index + 2][1] == 'VBG':
                return True

        if token in be_not_set:
            # aren't done
            if index < len(tokens) - 1 and tags[index + 1][1] == 'VBN':
                return True
            # aren't doing
            if index < len(tokens) - 1 and tags[index + 1][1] == 'VBG':
                return True

        if token in do_set:
            if index < len(tokens) - 1 and tags[index + 1][1] == 'VB':
                return True
            if index < len(tokens) - 2 and tags[index + 2][1] == 'VB':
                return True

        if token in have_set:
            # have done
            if index < len(tokens) - 1 and tags[index + 1][1] == 'VBN':
                # "has limited powers": a noun right after the participle means it
                # acts as a modifier, not as part of an auxiliary construction
                if index < len(tokens) - 2 and tags[index + 2][1] in ('NN', 'NNS'):
                    return False
                else:
                    return True
            # have not done
            if index < len(tokens) - 2 and tags[index + 2][1] == 'VBN':
                return True

    # a modal verb (MD) also counts as an auxiliary
    for tag in tags:
        if tag[1] == 'MD':
            return True
    return False
Example 27
 def process_content(self, content):
     raw = str(' ').join(content['paragraphs'])
     letters = set(lowercase)
     _tokens = []
     for w in word_tokenize(raw):
         lw = w.lower()
         if len(lw) > 2 and lw not in self.uscities and lw not in self.usstates and lw not in self.countries \
                 and lw not in self.stopwords and lw not in self.basicwords and set(lw) <= letters:
             _tokens.append(lw)
     self.texts.append(Text(_tokens))
     self.paragraph_tokens.extend(_tokens)
Example 28
    def get_stats(self, output_fname):
        fd = FreqDist()
        for text in self.texts:
            fd.update(set(text))

        fh = open(output_fname, 'w')
        text = Text(self.paragraph_tokens)
        fdist = FreqDist(text)
        for (w, f) in fdist.items():
            fh.write("%s\t%i\n" % (w, f))
        fh.close()
Example 29
def get_feats_counts(text):
    tokens = word_tokenize(text)
    t = Text(tokens)
    g1s = ngrams(t, 1)
    freq1 = FreqDist(g1s)
    g1s_list = [(g, count) for g, count in freq1.items()]
    g2s = ngrams(t, 2)
    freq2 = FreqDist(g2s)
    g2s_list = [(g, count) for g, count in freq2.items()]
    gs = g1s_list + g2s_list
    return dict(gs)
def get_nouns_frequency(dict_nouns):
    """
    Produce a dict keyed by the keys of dict_nouns, where each value is a per-noun frequency dict.
    :param dict_nouns: dict. The output of the get_nouns_from_topics function.
    :return: dict
    """
    dict_frequency = dict()
    for category in dict_nouns.keys():
        text_vocab = Text(dict_nouns[category], name=category).vocab()
        dict_frequency[category] = OrderedDict(sorted(text_vocab.items(), key=itemgetter(1), reverse=True))

    return dict_frequency
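Text.vocab() used above is just a cached FreqDist over the tokens, so the sort orders each category's nouns by count. A small self-contained sketch with invented categories:

from collections import OrderedDict
from operator import itemgetter

from nltk import Text

dict_nouns = {"sports": ["goal", "team", "goal"], "tech": ["chip", "cloud", "chip"]}
dict_frequency = {}
for category, nouns in dict_nouns.items():
    text_vocab = Text(nouns, name=category).vocab()  # vocab() returns a FreqDist
    dict_frequency[category] = OrderedDict(
        sorted(text_vocab.items(), key=itemgetter(1), reverse=True))
print(dict_frequency["sports"])  # 'goal' (2) is ordered before 'team' (1)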