def __init__(self, text):
    self.raw_text = text
    self.raw_text_length = len(text)
    self.number_of_letters = len(
        [x for x in self.raw_text if x.isalpha() or x.isdigit()])
    self.words = StylometryExtractor.TOKENIZER.tokenize(self.raw_text)
    self.tokens = word_tokenize(self.raw_text)
    self.number_of_words = len(self.words)
    self.number_of_tokens = len(self.tokens)
    # self.text = Text(word_tokenize(self.raw_text))
    self.words_frequency = FreqDist(Text(self.words))
    self.tokens_frequency = FreqDist(Text(self.tokens))
    self.chars_counter = FreqDist(self.raw_text)
    self.lemmatizer = WordNetLemmatizer()
    self.lemmatized_words_frequency = FreqDist(
        Text([self.lemmatizer.lemmatize(word) for word in self.words]))
    self.sentences = sent_tokenize(self.raw_text)
    self.number_of_sentences = len(self.sentences)
    self.sentence_chars = [len(sent) for sent in self.sentences]
    self.sentence_word_length = [
        len(sent.split()) for sent in self.sentences
    ]
    self.paragraphs = [
        p for p in self.raw_text.split("\n\n")
        if len(p) > 0 and not p.isspace()
    ]
    self.paragraph_word_length = [len(p.split()) for p in self.paragraphs]
    self.all_trigrams = self._all_trigrams()
    self.all_fourgrams = self._all_fourgrams()
    self.ngram_string = self._to_ngram_string()
    self.features = self._to_dict()
    self.feature_names = list(self.features.keys())
def __init__(self, file_content, author=DEFAULT_AUTHOR):
    self.author = author.strip()
    self.raw_content = file_content
    self.file_content = file_content.lower()
    self.tokens = PortugueseTextualProcessing.tokenize(self.file_content)
    self.text = Text(self.tokens)
    self.fdist = FreqDist(self.text)
    self.sentences = sent_tokenize(self.file_content, language='portuguese')
    self.sentence_chars = [len(sent) for sent in self.sentences]
    self.sentence_word_length = [len(sent.split()) for sent in self.sentences]
    self.paragraphs = [p for p in self.file_content.split("\n\n") if len(p) > 0 and not p.isspace()]
    self.paragraph_word_length = [len(p.split()) for p in self.paragraphs]
    self.punctuation = [".", ",", ";", "-", ":"]
    self.ner_entities = ['ABSTRACCAO', 'ACONTECIMENTO', 'COISA', 'LOCAL', 'ORGANIZACAO',
                         'OBRA', 'OUTRO', 'PESSOA', 'TEMPO', 'VALOR']
    self.white_spaces = len(self.file_content.split(' '))
    self.rich_tags = RichTags(PortugueseTextualProcessing.get_rich_tags(self.file_content), len(self.text))
    self.tagged_sentences = PortugueseTextualProcessing.postag(self.tokens)
    self.tagfdist = FreqDist([b for [(a, b)] in self.tagged_sentences])
    self.ner_tags = PortugueseTextualProcessing.ner_chunks(self.tokens)
    self.ner_ftags = FreqDist(self.ner_tags)
    self.spell = SpellChecker(language='pt')
    self.ROUNDING_FACTOR = 4
    self.LINE_BREAKS = ['\n', '\t', '\r']
def concord(nome, arquivo):
    """Build a concordance for the term `arquivo` over the uploaded file `nome`
    and write the results to saida.txt."""
    # Input
    nome_p = arquivo
    nome_e = str(nome)
    # print(nome_e, nome_p)  # for debugging

    # Open the file to be read
    arquivo_e = open(
        "/tmp/concordancia/{arquivo}".format(arquivo=nome_e)).read()

    # Tokenize the file
    token = word_tokenize(arquivo_e)
    texto = Text(token)
    # texto.concordance(nome_p)

    # Open the output file
    arquivo_s = open("/tmp/concordancia/saida.txt", "w")
    saida = concordance_2_txt(nome_p, token)
    for x in saida:
        arquivo_s.write("%s\n" % x)
    arquivo_s.close()

    # Remove the uploaded file
    file_remove("concordancia", nome_e)
def meaning_analysis(self, sentence_analysis):
    final_str = ''
    tokens = word_tokenize(sentence_analysis)
    text = Text(tokens)
    tags = pos_tag(text)
    print(tags)
    nouns = "NN NNP PRP NNS".split()
    verbs = "VB VBD VBP VBG".split()
    questions_nouns = "WP WRB".split()
    # Make sure single-token sentences still get one loop iteration.
    if len(tags) == 1:
        tag_len = len(tags) + 1
    else:
        tag_len = len(tags)
    for i in range(tag_len - 1):
        if tags[i][1] in verbs:
            # Look for a noun among the next four tags without indexing past
            # the end of the list.
            if any(tags[j][1] in nouns for j in range(i + 1, min(i + 5, len(tags)))):
                print("it's a decision")
            else:
                print("unknown decision")
            final_str = "decision"
            break
        elif tags[i][1] in questions_nouns:
            final_str = "question"
            break
        else:
            final_str = "question asked"
    return final_str
def tokenizer():
    fileids = cicero.abspaths()
    reader = CategorizedXMLCorpusReader('/', fileids, cat_file='categories.txt')
    tokens = Text(reader.words(fileids))
    return tokens
def get_count(self, obj: SearchResult) -> int:
    sentences = self.get_sentences(obj)
    tokens = [word_tokenize(sentence) for sentence in sentences]
    flat_tokens = list(itertools.chain(*tokens))
    lower_case_tokens = [token.lower() for token in flat_tokens]
    text = Text(lower_case_tokens)
    return text.count(obj.search_term.lower())
def save_most_popular_words_analysis(self, most_common_quantity):
    articles_tokens = list()
    for (codex_type, _) in tqdm(self.parser.codex_urls):
        raw_articles_info = self.parser.sorted_articles_info[codex_type]
        for article_info in tqdm(raw_articles_info):
            text = self.parser.get_article_text_by_id(article_info.id)
            text = text.lower()
            text = self.remove_chars_from_text(text, self.spec_chars)
            article_tokens = word_tokenize(' '.join(
                self.mystem.lemmatize(text)))
            for stop_word in self.stop_words:
                while stop_word in article_tokens:
                    article_tokens.remove(stop_word)
            articles_tokens.extend(article_tokens)

    text = Text(articles_tokens)
    f_dist = FreqDist(text)

    if os.path.exists(self.config['most_popular_words_analysis_file']):
        os.remove(self.config['most_popular_words_analysis_file'])

    with open(self.config['most_popular_words_analysis_file'],
              mode='w') as most_popular_words_analysis_file:
        most_popular_words_analysis_writer = csv.writer(
            most_popular_words_analysis_file,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL)
        most_popular_words_analysis_writer.writerow(
            ['word', 'word_count', 'frequency'])
        for info in f_dist.most_common(most_common_quantity):
            most_popular_words_analysis_writer.writerow(
                [info[0], info[1], info[1] / len(articles_tokens)])
def read_text_sections(self, filename):
    tokens = []
    file_handle = open(filename, "r")
    for line in file_handle:
        try:
            file_tokens = word_tokenize(line)
            file_tokenized_text = Text(file_tokens)
            stop_words = stopwords.words('english')
            # Filter out stop words, special characters, short tokens, and digits.
            for token in file_tokenized_text:
                lower_str = token.lower()
                if lower_str not in stop_words and re.match(regex_clear, lower_str) and len(lower_str) > 2\
                        and not lower_str.isdigit():
                    tokens.append(lower_str)
        except UnicodeDecodeError:
            print "Unicode Decode Error: Moving On"
    if len(tokens) != 0:
        self.token_list.append(tokens)
    file_handle.close()
def SentenceBeginWithConj(sentence):
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    if tags[0][1] == "CC" or tags[0][1] == "IN":
        return True
    return False
def read_file(self, filename):
    """
    Read a file and append its cleaned tokens to self.token_list.

    :param filename: Name of the file to be read.
    """
    tokens = []
    file_handle = open(filename, "r")
    file_text = file_handle.read()
    # file_text contains the whole file. This is acceptable because each file's
    # contents are not large, although the number of files is large.
    try:
        file_tokens = word_tokenize(file_text)
        file_tokenized_text = Text(file_tokens)
        stop_words = stopwords.words('english')
        # Filter out stop words, special characters, short tokens, and digits.
        for token in file_tokenized_text:
            lower_str = token.lower()
            if lower_str not in stop_words and re.match(regex_clear, lower_str) and len(lower_str) > 2\
                    and not lower_str.isdigit():
                tokens.append(lower_str)
    except UnicodeDecodeError:
        print "Unicode Decode Error: Moving On"
    file_handle.close()
    if len(tokens) != 0:
        self.token_list.append(tokens)
def get_feats_inds(text):
    t = Text(word_tokenize(text))
    g1s = [(g, True) for g in ngrams(t, 1)]
    g2s = [(g, True) for g in ngrams(t, 2)]
    # 3-grams, 4-grams, and so on can also be used
    # g3s = [(g, True) for g in ngrams(t, 3)]
    return dict(g1s + g2s)
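# Usage sketch (illustrative, not from the original source): the boolean
# unigram/bigram features returned by get_feats_inds can be fed directly to an
# NLTK classifier. The example strings and labels below are made up.
from nltk.classify import NaiveBayesClassifier

train_set = [(get_feats_inds("a wonderful , moving film"), "pos"),
             (get_feats_inds("a dull and clumsy film"), "neg")]
classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify(get_feats_inds("wonderful acting , moving story")))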
def getNLTKText(carrel):

    # configure
    MODEL = 'reader.nltk'

    # require
    from nltk import Text, word_tokenize
    from os import path, stat
    import pickle

    # initialize
    localLibrary = configuration('localLibrary')
    file = localLibrary / carrel / ETC / MODEL

    # check to see if we've previously been here
    if path.exists(file):

        # read the model
        with open(file, 'rb') as handle:
            model = pickle.load(handle)

    else:

        # create the model and save it for future use
        corpus = localLibrary / carrel / ETC / CORPUS
        model = Text(word_tokenize(open(corpus).read()))
        with open(file, 'wb') as handle:
            pickle.dump(model, handle)

    # return the model
    return (model)
def concordance(carrel, query, width, lines):
    """A poor man's search engine.

    Given a query, this subcommand will search <carrel> and return a list of
    results where each result is a set of words to the left of query, the query,
    and a set of words to the right of query -- a keyword-in-context index. This
    is useful for answering the question, "What words are used in the same breath
    as the given word?" The query can be a phrase. Consider creating a word cloud
    from the output of this command to visualize the "words used in the same
    breath".

    Examples:

    \b
      rdr concordance homer -q hector
      rdr concordance homer -q 'hector was'

    See also: rdr ngrams --help"""

    # require
    from nltk import Text, word_tokenize

    # sanity checks
    checkForCarrel(carrel)
    checkForPunkt()

    # initialize, read, and normalize; ought to save the result for future use
    localLibrary = configuration('localLibrary')
    corpus = localLibrary / carrel / ETC / CORPUS
    text = Text(word_tokenize(open(corpus).read()))

    # split query into a list, conditionally
    if ' ' in query:
        query = query.split(' ')

    # do the work and output
    lines = text.concordance_list(query, width=width, lines=lines)
    for line in lines:
        click.echo(line.line)
def __init__(self, corpus, old_theme_words, new_theme_words):
    """Initialize parser"""
    print("Initializing themes.")
    textified_corpus = Text(word.lower() for word in corpus.words())
    print("Finished textifying corpus.")
    self._old_theme = Theme.Theme(textified_corpus, old_theme_words)
    self._new_theme = Theme.Theme(textified_corpus, new_theme_words)
def get_feats_counts(text):
    t = Text(word_tokenize(text))
    g1s = [(g, count) for g, count in FreqDist(ngrams(t, 1)).items()]
    g2s = [(g, count) for g, count in FreqDist(ngrams(t, 2)).items()]
    # 3-grams, 4-grams, and so on can also be used
    # g3s = [(g, count) for g, count in FreqDist(ngrams(t, 3)).items()]
    return dict(g1s + g2s)
def SentenceBeginWithPronoun(sentence):
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    if tags[0][1] == "PRP" or tags[0][1] == "PRP$":
        return True
    return False
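# Usage sketch (illustrative, not from the original source): the helper returns
# True only when the first token of the sentence is tagged PRP or PRP$.
print(SentenceBeginWithPronoun("She closed the door."))   # expected True
print(SentenceBeginWithPronoun("The door was closed."))   # expected False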
def understand_text(self, source):
    output = open(
        "Analytics_for_" + source +
        '_{:%Y_%m_%d_%H%M%S}.txt'.format(datetime.datetime.now()), "w")
    main = self.combine_articles_from_source(source)
    puncts = list(string.punctuation)
    article_tokens = word_tokenize(main)
    clean_tokens = []
    stop_words = set(stopwords.words("english"))

    # Remove punctuation and stop words
    for token in article_tokens:
        if token not in puncts and token not in stop_words and token != "'s" and token != "``" and token != "''":
            clean_tokens.append(token)

    print("************ANALYSING************")
    print(main)
    output.write("#########################################################")
    output.write("#Analysis of all cached posts by " + source + " #")
    output.write("#########################################################")
    output.write("# Concatenated text: #")
    output.write("#########################################################")
    output.write(main.encode('utf-8', 'ignore'))
    output.write("#########################################################")
    output.write("\n\n")
    print("*********************************")
    output.write("############Detected tokens:#############################\n\n")

    fdist = FreqDist(clean_tokens)
    print("*************STATS:*****************")
    print("Detected words: ")
    words = ""
    for key in fdist.keys():
        words += key + ", "
    print(words)
    output.write(words.encode('utf-8', 'ignore') + "\n")
    output.write("\n\n#######################Top 25 words:#####################\n\n")

    print("\n\n***25 Most common***:")
    for common in fdist.most_common(n=25):
        print("\"" + common[0] + "\"" + " occurrences " + str(common[1]))
        output.write("\"" + common[0].encode('utf-8', 'ignore') + "\"" +
                     " occurrences " + str(common[1]) + "\n")

    output.write("######################COMPLETE############################")
    output.close()
    text = Text(clean_tokens)
    # text.plot(25)
    print("************/STATS*****************")
def SentenceBeginWithPrep(sentence):
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    if tags[0][1] == "IN" and tags[0][0].lower() not in subordinate_conjunction_set_one_word:
        return True
    return False
def fun01():
    """Explore the Gutenberg corpus: file ids, word count, and a concordance."""
    print gutenberg.fileids()
    # emma by jane austen
    emma = gutenberg.words('austen-emma.txt')
    # how many words it contains
    print len(emma)
    # concordance() prints its results directly
    Text(emma).concordance("surprize")
def verbCounter(text):
    tokens = word_tokenize(text.lower())
    text = Text(tokens)
    tags = pos_tag(text)
    counts = Counter(tag for word, tag in tags)
    return counts
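# Usage sketch (illustrative, not from the original source): despite its name,
# verbCounter returns a Counter over every POS tag in the lower-cased text,
# not just the verb tags.
tag_counts = verbCounter("The dogs were barking while the cat slept.")
print(tag_counts.most_common(3))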
def getPronounCount(sentence):
    pronoun_number = 0
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    for tag in tags:
        if tag[1] == "PRP" or tag[1] == "PRP$":
            pronoun_number = pronoun_number + 1
    return pronoun_number
def generate_text_object(tokens, stopword=0):
    _tokens = []
    # NOTE: both branches currently collect the same tokens, so the stopword
    # flag has no effect on the result.
    if stopword == 1:
        for tweet in tokens:
            _tokens.extend(tweet.get_tweet_tokens())
    else:
        for tweet in tokens:
            _tokens.extend(tweet.get_tweet_tokens())
    return Text(_tokens)
def getConjunctionCount(sentence):
    conjunction_number = 0
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    for tag in tags:
        if tag[1] == "CC" or tag[1] == "IN":
            conjunction_number = conjunction_number + 1
    return conjunction_number
def get_feats(text):
    tokens = word_tokenize(text)
    t = Text(tokens)
    g1s = ngrams(t, 1)
    g1s_list = [(g, True) for g in g1s]
    g2s = ngrams(t, 2)
    g2s_list = [(g, True) for g in g2s]
    gs = g1s_list + g2s_list
    return dict(gs)
def getPrepositionCount(sentence):
    preposition_number = 0
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    for tag in tags:
        if tag[1] == "IN":
            preposition_number = preposition_number + 1
    return preposition_number
def hasAuxiliaryVerb(sentence):
    tokens = word_tokenize(sentence)
    text = Text(tokens)
    tags = pos_tag(text)
    for index, token in enumerate(tokens):
        # if the sentence has "don't", "doesn't", or "didn't", it must have an auxiliary verb
        if token in do_not_set:
            return True
        if token in be_set:
            # are done
            if index < len(tokens) - 1 and tags[index + 1][1] == 'VBN':
                return True
            # are not done
            if index < len(tokens) - 2 and tags[index + 2][1] == 'VBN':
                return True
            # are doing
            if index < len(tokens) - 1 and tags[index + 1][1] == 'VBG':
                return True
            # are not doing
            if index < len(tokens) - 2 and tags[index + 2][1] == 'VBG':
                return True
        if token in be_not_set:
            # aren't done
            if index < len(tokens) - 1 and tags[index + 1][1] == 'VBN':
                return True
            # aren't doing
            if index < len(tokens) - 1 and tags[index + 1][1] == 'VBG':
                return True
        if token in do_set:
            if index < len(tokens) - 1 and tags[index + 1][1] == 'VB':
                return True
            if index < len(tokens) - 2 and tags[index + 2][1] == 'VB':
                return True
        if token in have_set:
            # have done
            if index < len(tokens) - 1 and tags[index + 1][1] == 'VBN':
                # e.g. "has limited powers": here "have" is a main verb, not an auxiliary
                if index < len(tokens) - 2 and tags[index + 2][1] in ('NN', 'NNS'):
                    return False
                else:
                    return True
            # have not done
            if index < len(tokens) - 2 and tags[index + 2][1] == 'VBN':
                return True
    for tag in tags:
        # modal verbs ("can", "will", ...) also count as auxiliaries
        if tag[1] == 'MD':
            return True
    return False
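# Usage sketch (illustrative, not from the original source): hasAuxiliaryVerb
# relies on word sets defined elsewhere in its module; minimal stand-ins are
# assumed here just to make the call runnable.
do_not_set = {"don't", "doesn't", "didn't"}
do_set = {"do", "does", "did"}
be_set = {"is", "are", "was", "were", "be", "been", "being"}
be_not_set = {"isn't", "aren't", "wasn't", "weren't"}
have_set = {"have", "has", "had"}

print(hasAuxiliaryVerb("The report has been finished."))  # expected True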
def process_content(self, content):
    raw = str(' ').join(content['paragraphs'])
    letters = set(lowercase)
    _tokens = []
    for w in word_tokenize(raw):
        lw = w.lower()
        if len(lw) > 2 and lw not in self.uscities and lw not in self.usstates and lw not in self.countries \
                and lw not in self.stopwords and lw not in self.basicwords and set(lw) <= letters:
            _tokens.append(lw)
    self.texts.append(Text(_tokens))
    self.paragraph_tokens.extend(_tokens)
def get_stats(self, output_fname):
    fd = FreqDist()
    for text in self.texts:
        fd.update(set(text))
    fh = open(output_fname, 'w')
    text = Text(self.paragraph_tokens)
    fdist = FreqDist(text)
    for (w, f) in fdist.iteritems():
        print >> fh, "%s\t%i" % (w, f)
    fh.close()
def get_feats_counts(text):
    tokens = word_tokenize(text)
    t = Text(tokens)
    g1s = ngrams(t, 1)
    freq1 = FreqDist(g1s)
    g1s_list = [(g, count) for g, count in freq1.items()]
    g2s = ngrams(t, 2)
    freq2 = FreqDist(g2s)
    g2s_list = [(g, count) for g, count in freq2.items()]
    gs = g1s_list + g2s_list
    return dict(gs)
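# Usage sketch (illustrative, not from the original source): the returned dict
# maps unigram and bigram tuples to their raw counts.
feats = get_feats_counts("to be or not to be")
print(feats[("to",)])       # expected 2
print(feats[("to", "be")])  # expected 2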
def get_nouns_frequency(dict_nouns):
    """
    Build a dict keyed by the keys of dict_nouns, whose values are per-noun
    frequency dicts computed from the corresponding noun lists.

    :param dict_nouns: dict. The output of the get_nouns_from_topics function.
    :return: dict
    """
    dict_frequency = dict()
    for category in dict_nouns.keys():
        text_vocab = Text(dict_nouns[category], name=category).vocab()
        dict_frequency[category] = OrderedDict(
            sorted(text_vocab.items(), key=itemgetter(1), reverse=True))
    return dict_frequency
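# Usage sketch (illustrative, not from the original source): dict_nouns maps a
# category name to a list of extracted nouns; the sample data below is made up.
sample_nouns = {"economy": ["market", "price", "market", "tax", "price", "market"]}
print(get_nouns_frequency(sample_nouns))
# expected: {'economy': OrderedDict([('market', 3), ('price', 2), ('tax', 1)])}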