import nltk
from nltk.tag import StanfordPOSTagger

from NLPUtils import TextIO_helper


def demo():
    doc = TextIO_helper.read_plain_text("Resources_assets", "e960401.htm")
    spanish_tokenizer = nltk.data.load("tokenizers/punkt/spanish.pickle")
    sent_tokenize_list = spanish_tokenizer.tokenize(doc)
    print(sent_tokenize_list[10])
    # tag the tokens of the eleventh sentence with the Stanford Spanish POS tagger
    spanish_postagger = StanfordPOSTagger(tagger, jar)
    post_tagged_list = spanish_postagger.tag(sent_tokenize_list[10].split())
    print(post_tagged_list)
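# demo() above relies on module-level `tagger` and `jar` variables that are not shown in
# this fragment. A minimal sketch of what they might look like; the paths below are
# placeholders (assumptions), not taken from this repo:
jar = "/path/to/stanford-postagger.jar"      # the Stanford POS Tagger jar
tagger = "/path/to/models/spanish.tagger"    # the Spanish model shipped with the tagger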
def get_normalized_articles(collection, regex, package_stopwords, stopwords,
                            package_lemma, lemmatization_dictionary):
    """
    :param collection: the text of the collection where we will search for the articles
    :param regex: the regular expression which defines where the articles are;
        the group which contains the articles has to be named "Article" (?P<Article> ----- )
    :param package_stopwords: the resources package that contains the stopwords file
    :param stopwords: the name of the stopwords file whose words will be removed
    :param package_lemma: the resources package that contains the lemmatization dictionary
    :param lemmatization_dictionary: the name of a text file which contains lemmas for a collection of words
    :return: articles: an array of the normalized articles in the collection
    """
    match = get_raw_articles(collection, regex)
    dictionaryMap = NormalizeText.create_dic_map(
        TextIO_helper.read_raw_text(package_lemma, lemmatization_dictionary))
    stopwordsdic = TextIO_helper.read_raw_text(package_stopwords, stopwords).read()
    articles = [
        NormalizeText.lemmatize_and_remove_stopwords_punctuations(
            NormalizeText.parse_html(m.group("Article")).lower(),
            dictionaryMap, stopwordsdic)
        for m in match
    ]
    return articles
def get_most_commmon_words_in_article(corpus, regex, package_stopwords, stopwords,
                                      package_lemma, lemmatization_dictionary, n):
    """
    :param corpus: the text of the corpus where we will search for the articles
    :param regex: regular expression which defines a search pattern for matching articles;
        the group which contains the articles has to be named "Article" (?P<Article> ----- )
    :param package_stopwords: the resources package that contains the stopwords file
    :param stopwords: the name of the stopwords file whose words will be removed from the collection
    :param package_lemma: the resources package that contains the lemmatization dictionary
    :param lemmatization_dictionary: the name of a text file which contains a lemmas dictionary
    :return: most_common: an array which contains, for each article in the collection, its "n" most common words
    """
    match = get_raw_articles(corpus, regex)
    dictionaryMap = NormalizeText.create_dic_map(
        TextIO_helper.read_raw_text(package_lemma, lemmatization_dictionary))
    stopwordsdic = TextIO_helper.read_raw_text(package_stopwords, stopwords).read()
    most_common = [
        nltk.FreqDist(
            NormalizeText.lemmatize_and_remove_stopwords_punctuations(
                NormalizeText.parse_html(m.group("Article")).lower(),
                dictionaryMap, stopwordsdic)).most_common(n)
        for m in match
    ]
    return most_common
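# A usage sketch for get_most_commmon_words_in_article (the helper name demo_most_common
# is hypothetical, not part of the original module); it mirrors the commented-out call in
# demo() below and assumes `re` and TextIO_helper are imported at the top of this module.
def demo_most_common():
    doc = TextIO_helper.read_raw_text("Resources_assets", "e960404.htm").read()
    rex = re.compile(r'<title.*?>(.*?)<hr>(?P<Article>.*?)<hr>', re.S | re.M)
    top_words = get_most_commmon_words_in_article(doc, rex, "Resources_assets", "stopwords_es.txt",
                                                  "Resources_assets", "lemmatization-es.txt", 5)
    # each element is one article's 5 most frequent (word, count) pairs
    print(top_words[2])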
def demo(): """ This is a test of the app which separate by articles the e960404.htm corpus and then it returns the most common words in each article in this corpus we found the articles between an <article> tag and a <hr> tag the searched text is represented by the name group (?P<text>.*?) where .*? is any character and ?P<text> is the name given (article) """ doc = TextIO_helper.read_raw_text("Resources_assets", "e960404.htm").read() rex = re.compile(r'<title.*?>(.*?)<hr>(?P<Article>.*?)<hr>', re.S | re.M) ##print(get_most_commmon_words_in_article(doc, rex, "Resources_assets", "stopwords_es.txt", ## "Resources_assets", "lemmatization-es.txt", 5)[2]) #print(list(get_n_article(doc, rex)).__getitem__(1).group("Article")) print( get_normalized_articles(doc, rex, "Resources_assets", "stopwords_es.txt", "Resources_assets", "lemmatization-es.txt")[1])
def get_lemmatized_articles(collection, regex, package_lemma, lemmatization_dictionary):
    """
    :param collection: the text from which you want to extract the articles
    :param regex: the regular expression which defines where the articles are;
        the group which contains the articles has to be named "Article" (?P<Article> ----- )
    :param package_lemma: the resources package that contains the lemmatization dictionary
    :param lemmatization_dictionary: the name of a text file which contains a lemmas dictionary
    :return: articles: an array of the lemmatized articles in the collection
    """
    match = get_raw_articles(collection, regex)
    dictionaryMap = NormalizeText.create_dic_map(
        TextIO_helper.read_raw_text(package_lemma, lemmatization_dictionary))
    articles = [
        NormalizeText.lemmatize(
            NormalizeText.parse_html(m.group("Article")).lower(),
            dictionaryMap)
        for m in match
    ]
    return articles
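# A usage sketch for get_lemmatized_articles (the helper name demo_lemmatized is
# hypothetical, not part of the original module); it reuses the corpus and lemma
# dictionary that demo() uses and assumes `re` and TextIO_helper are already imported.
def demo_lemmatized():
    doc = TextIO_helper.read_raw_text("Resources_assets", "e960404.htm").read()
    rex = re.compile(r'<title.*?>(.*?)<hr>(?P<Article>.*?)<hr>', re.S | re.M)
    articles = get_lemmatized_articles(doc, rex, "Resources_assets", "lemmatization-es.txt")
    # the lemmatized second article
    print(articles[1])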
import codecs
import pkg_resources

from NLPUtils import TextIO_helper

if __name__ == '__main__':
    name = input()
    doc = str.lower(TextIO_helper.read_plain_text("Resources_assets", name))
    # create the output file "Lower_<name>" and write the lowercased text into it
    f = TextIO_helper.create_raw_text("Resources_assets", "Lower_" + name)
    f.writelines(doc)
    f.close()
    #print(pkg_resources.resource_filename("Resources_assets", 'e960404.htm'))
    #print(codecs.open(pkg_resources.resource_filename("Resources_assets", 'e960404.htm'), "r", encoding='utf-8', errors='ignore').read())
import re

import nltk

from NLPUtils import ArticleSeparator
from NLPUtils import NormalizeText
from NLPUtils import TextIO_helper

debug = False


def printDebug(something):
    if debug:
        print(something)


def generate_model(conditional_freqDist, word, num=15):
    # starting from `word`, repeatedly print the current word and move to its
    # most frequent successor in the conditional frequency distribution
    for i in range(num):
        print(word)
        word = conditional_freqDist[word].max()


if __name__ == '__main__':
    '''Generate random text using bigrams and a conditional frequency distribution.'''
    doc = TextIO_helper.read_raw_text("Resources_assets", "e960404.htm").read()
    rex = re.compile(r'<title.*?>(.*?)<hr>(?P<Article>.*?)<hr>', re.S | re.M)
    articles = ArticleSeparator.get_raw_articles_list(doc, rex)
    articleTen = NormalizeText.parse_html(articles[1].group("Article"))
    articleTen = nltk.word_tokenize(articleTen)
    bigrams = nltk.bigrams(articleTen)
    condFreqDist = nltk.ConditionalFreqDist(bigrams)
    printDebug(condFreqDist['Jueves'].items())
    generate_model(condFreqDist, "Jueves")
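# A toy, self-contained sketch of the idea above (the sentence is made up, not from the
# corpus): the conditional frequency distribution maps each word to a FreqDist of the
# words that follow it, and .max() picks the most frequent successor, which is what
# drives generate_model().
def _toy_bigram_demo():
    toy_tokens = "el gato come y el gato come y el gato duerme".split()
    toy_cfd = nltk.ConditionalFreqDist(nltk.bigrams(toy_tokens))
    print(toy_cfd["gato"].max())      # -> 'come' ("come" follows "gato" twice, "duerme" once)
    generate_model(toy_cfd, "el", 5)  # prints: el gato come y el (one word per line)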
import nltk
from nltk.corpus import cess_esp as cess
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt

from NLPUtils import TextIO_helper

print('hi')
# Read the corpus into a list; each entry in the list is one tagged sentence.
cess_sents = cess.tagged_sents()
print('h2')

# Train the unigram tagger.
uni_tag = ut(cess_sents)
print('h3')

doc = TextIO_helper.read_plain_text("Resources_assets", "e960401.htm")
spanish_tokenizer = nltk.data.load("tokenizers/punkt/spanish.pickle")
sent_tokenize_list = spanish_tokenizer.tokenize(doc)
sentence = sent_tokenize_list[10]
print(sentence)

# The tagger reads a list of tokens.
uni_tag.tag(sentence.split(" "))

# Split the corpus into training and testing sets.
train = int(len(cess_sents) * 90 / 100)  # 90%

# Train a bigram tagger with only the training data.
bi_tag = bt(cess_sents[:train])

# Evaluate on the remaining 10% of testing data.
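# A sketch of the evaluation step the comment above refers to (not in the original code):
# score the bigram tagger on the held-out 10% of the tagged sentences.
print(bi_tag.evaluate(cess_sents[train:]))  # newer NLTK releases expose this as .accuracy()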
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer

from NLPUtils import TextIO_helper


def lemmatize_and_remove_stopwords_punctuations(text, dictionaryMap, stopwords):
    '''
    dictionaryMap should be a dictionary
    stopwords should be an array
    returns an array of lemmatized tokens
    '''
    text = RegexpTokenizer(r'\w+').tokenize(text)  # removing punctuation
    text_aux = []
    for w in text:
        if w not in stopwords:  # removing stopwords
            text_aux.append(get_lemma(w, dictionaryMap))
    return text_aux


def parse_html(text):
    # strip HTML markup and return only the plain text
    return BeautifulSoup(text, 'html.parser').get_text()


if __name__ == '__main__':
    package = input()
    name = input()
    doc = str.lower(TextIO_helper.read_plain_text(package, name))
    dic = TextIO_helper.read_raw_text("Resources_assets", "lemmatization-es.txt")
    dictionaryMap = create_dic_map(dic)
    #doc = re.sub(r"\b(\w+)\b", lambda w: get_lemma(w.group(1), dictionaryMap), doc)
    f = TextIO_helper.create_raw_text("Resources_assets", "test_" + name)
    f.writelines(lemmatize_and_remove_punctuations(doc, dictionaryMap))
    print("finished")
    f.close()
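# A toy sketch of lemmatize_and_remove_stopwords_punctuations (the helper name, the lemma
# map, and the stopword list are made up for illustration). It assumes get_lemma() returns
# the mapped lemma when the token is in dictionaryMap and the token itself otherwise.
def _toy_lemmatize_demo():
    toy_lemmas = {"gatos": "gato", "corren": "correr"}
    toy_stopwords = ["los", "y"]
    tokens = lemmatize_and_remove_stopwords_punctuations("los gatos corren y saltan!",
                                                         toy_lemmas, toy_stopwords)
    print(tokens)  # under the assumption above: ['gato', 'correr', 'saltan']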