Example no. 1
def demo():
    doc = TextIO_helper.read_plain_text("Resources_assets", "e960401.htm")
    spanish_tokenizer = nltk.data.load("tokenizers/punkt/spanish.pickle")
    sent_tokenize_list = spanish_tokenizer.tokenize(doc)
    print(sent_tokenize_list[10])
    # `tagger` and `jar` are module-level paths to the Stanford POS tagger model file
    # and the stanford-postagger .jar, defined elsewhere in the module.
    spanish_postagger = StanfordPOSTagger(tagger, jar)

    post_tagged_list = spanish_postagger.tag(sent_tokenize_list[10].split())
    print(post_tagged_list)
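The snippet above assumes that nltk, TextIO_helper, StanfordPOSTagger, tagger and jar are already available at module level. A minimal, hedged sketch of what that setup might look like; the two file paths are placeholders, not taken from the original project:

import nltk
from nltk.tag.stanford import StanfordPOSTagger
from NLPUtils import TextIO_helper

# Placeholder paths to a local Stanford POS tagger install; adjust to your setup.
jar = "stanford-postagger/stanford-postagger.jar"
tagger = "stanford-postagger/models/spanish.tagger"

demo()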
Example no. 2
def get_normalized_articles(collection, regex, package_stopwords, stopwords,
                            package_lemma, lemmatization_dictionary):
    """
    :param collection: this is the path of the collection where we will search for the articles
    :param regex: this is the regular expression which defines where are the articles
           the group which contains the articles has to be named as "Article" (?P<Article> ----- )
    :param stopwords: this is a collection of stopwords that will be removed
    :param lemmatization_dictionary: the path of a text file which contains lemmas for a collection of words
    :return: articles: an array of articles in the collection
    """
    match = get_raw_articles(collection, regex)
    dictionaryMap = NormalizeText.create_dic_map(
        TextIO_helper.read_raw_text(package_lemma, lemmatization_dictionary))
    # Split the stopwords file into a list so membership tests match whole words.
    stopwordsdic = TextIO_helper.read_raw_text(package_stopwords,
                                               stopwords).read().split()
    articles = [
        NormalizeText.lemmatize_and_remove_stopwords_punctuations(
            NormalizeText.parse_html(m.group("Article")).lower(),
            dictionaryMap, stopwordsdic) for m in match
    ]
    return articles
Example no. 3
def get_most_commmon_words_in_article(corpus, regex, package_stopwords,
                                      stopwords, package_lemma,
                                      lemmatization_dictionary, n):
    """
     :param corpus: this is the path for the corpus where we will search for the articles
     :param regex:  regular expression which defines a search pattern for matching articles.
            The group which contains the articles has to be named as "Article" (?P<Article> ----- )
     :param stopwords: this is a collection of stopwords that will be removed from the collection
     :param lemmatization_dictionary: the path of a text file which contains a lemmas dictionary
     :return: most_common: an array which contains the "n" most common words in the collection articles
    """
    match = get_raw_articles(corpus, regex)
    dictionaryMap = NormalizeText.create_dic_map(
        TextIO_helper.read_raw_text(package_lemma, lemmatization_dictionary))
    # Split the stopwords file into a list so membership tests match whole words.
    stopwordsdic = TextIO_helper.read_raw_text(package_stopwords,
                                               stopwords).read().split()
    most_common = [
        nltk.FreqDist(
            NormalizeText.lemmatize_and_remove_stopwords_punctuations(
                NormalizeText.parse_html(m.group("Article")).lower(),
                dictionaryMap, stopwordsdic)).most_common(n) for m in match
    ]
    return most_common
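A hedged usage sketch, mirroring the commented-out call in the demo of Example no. 4 below; it assumes re and TextIO_helper are imported and that the Resources_assets files exist as in the other snippets:

doc = TextIO_helper.read_raw_text("Resources_assets", "e960404.htm").read()
rex = re.compile(r'<title.*?>(.*?)<hr>(?P<Article>.*?)<hr>', re.S | re.M)
top_words = get_most_commmon_words_in_article(doc, rex, "Resources_assets",
                                               "stopwords_es.txt",
                                               "Resources_assets",
                                               "lemmatization-es.txt", 5)
print(top_words[2])  # five (word, count) pairs for the third article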
Example no. 4
def demo():
    """
    This is a test of the app which separate by articles the e960404.htm corpus
    and then it returns the most common words in each article
    in this corpus we found the articles between an <article> tag and a <hr> tag
    the searched text is represented by the name group (?P<text>.*?)
    where .*? is any character and ?P<text> is the name given (article)
    """
    doc = TextIO_helper.read_raw_text("Resources_assets", "e960404.htm").read()
    rex = re.compile(r'<title.*?>(.*?)<hr>(?P<Article>.*?)<hr>', re.S | re.M)
    ##print(get_most_commmon_words_in_article(doc, rex, "Resources_assets", "stopwords_es.txt",
    ##   "Resources_assets", "lemmatization-es.txt", 5)[2])
    #print(list(get_n_article(doc, rex)).__getitem__(1).group("Article"))
    print(
        get_normalized_articles(doc, rex, "Resources_assets",
                                "stopwords_es.txt", "Resources_assets",
                                "lemmatization-es.txt")[1])
Example no. 5
def get_lemmatized_articles(collection, regex, package_lemma,
                            lemmatization_dictionary):
    """
    :param collection: the text where you want to extract the articles
    :param regex: this is the regular expression which defines where are the articles
           the group which contains the articles has to be named as "Article" (?P<Article> ----- )
    :return: collections.Iterable[__Match[T]] which contains the articles in a raw format
    """
    match = get_raw_articles(collection, regex)
    dictionaryMap = NormalizeText.create_dic_map(
        TextIO_helper.read_raw_text(package_lemma, lemmatization_dictionary))
    articles = [
        NormalizeText.lemmatize(
            NormalizeText.parse_html(m.group("Article")).lower(),
            dictionaryMap) for m in match
    ]
    return articles
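For comparison with the demo in Example no. 4, a hedged usage sketch of get_lemmatized_articles under the same assumptions (re and TextIO_helper imported, Resources_assets files present):

doc = TextIO_helper.read_raw_text("Resources_assets", "e960404.htm").read()
rex = re.compile(r'<title.*?>(.*?)<hr>(?P<Article>.*?)<hr>', re.S | re.M)
lemmatized = get_lemmatized_articles(doc, rex, "Resources_assets",
                                     "lemmatization-es.txt")
print(lemmatized[1])  # the second article after lemmatization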
Example no. 6
import codecs

import pkg_resources

from NLPUtils import TextIO_helper

if __name__ == '__main__':

    name = input()
    doc = str.lower(TextIO_helper.read_plain_text("Resources_assets", name))
    f = TextIO_helper.create_raw_text("Resources_assets", "Lower_" + name)  # creates the output file "Lower_<name>" in Resources_assets
    f.writelines(doc)
    f.close()
    #print(pkg_resources.resource_filename("Resources_assets", 'e960404.htm'))
    #print(codecs.open(pkg_resources.resource_filename("Resources_assets", 'e960404.htm'), "r", encoding='utf-8', errors='ignore').read())

Example no. 7
import re

import nltk

from NLPUtils import NormalizeText

from NLPUtils import ArticleSeparator

from NLPUtils import TextIO_helper

debug = False


def printDebug(something):
    if debug:
        print(something)


def generate_model(conditional_freqDist, word, num=15):
    for i in range(num):
        print(word)
        word = conditional_freqDist[word].max()


if __name__ == '__main__':
    '''Generate Random Text using bigrams and cond freq dist'''
    doc = TextIO_helper.read_raw_text("Resources_assets", "e960404.htm").read()
    rex = re.compile(r'<title.*?>(.*?)<hr>(?P<Article>.*?)<hr>', re.S | re.M)
    articles = ArticleSeparator.get_raw_articles_list(doc, rex)
    articleTen = NormalizeText.parse_html(articles[1].group("Article"))
    articleTen = nltk.word_tokenize(articleTen)
    bigrams = nltk.bigrams(articleTen)
    condFreqDist = nltk.ConditionalFreqDist(bigrams)
    printDebug(condFreqDist['Jueves'].items())
    generate_model(condFreqDist, "Jueves")
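To see the bigram / ConditionalFreqDist idea without the corpus files, here is a self-contained toy sketch of the same technique; the sentence and start word are invented for illustration, and a stop condition is added for words with no recorded successor:

import nltk

toy_tokens = nltk.word_tokenize("el gato come pescado y el perro come carne")
toy_cfd = nltk.ConditionalFreqDist(nltk.bigrams(toy_tokens))

word = "el"
for _ in range(5):
    print(word)
    if not toy_cfd[word]:          # no bigram starts with this word
        break
    word = toy_cfd[word].max()     # follow the most frequent successor, as in generate_model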
Example no. 8
import nltk
from nltk.corpus import cess_esp as cess
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt

# Read the corpus into a list,
# each entry in the list is one sentence.
from NLPUtils import TextIO_helper
print('hi')
cess_sents = cess.tagged_sents()
print('h2')

# Train the unigram tagger
uni_tag = ut(cess_sents)
print('h3')
doc = TextIO_helper.read_plain_text("Resources_assets", "e960401.htm")
spanish_tokenizer = nltk.data.load("tokenizers/punkt/spanish.pickle")
sent_tokenize_list = spanish_tokenizer.tokenize(doc)
sentence = sent_tokenize_list[10]
print(sentence)

# Tagger reads a list of tokens.
print(uni_tag.tag(sentence.split(" ")))

# Split corpus into training and testing set.
train = int(len(cess_sents)*90/100) # 90%

# Train a bigram tagger with only training data.
bi_tag = bt(cess_sents[:train])

# Evaluate on the remaining 10% of testing data.
print(bi_tag.evaluate(cess_sents[train:]))
Example no. 9
def lemmatize_and_remove_stopwords_punctuations(text, dictionaryMap, stopwords):
    '''
    dictionaryMap should be a dictionary,
    stopwords should be an array;
    returns an array of lemmatized tokens
    '''
    text = RegexpTokenizer(r'\w+').tokenize(text) #removing punctuation
    text_aux = []
    for w in text:
        if w not in stopwords: #removing stopwords
            text_aux.append(get_lemma(w, dictionaryMap))
    return text_aux



def parse_html(text):
    return BeautifulSoup(text, 'html.parser').get_text()


if __name__ == '__main__':
    package = input()
    name = input()
    doc = str.lower(TextIO_helper.read_plain_text(package, name))
    dic = TextIO_helper.read_raw_text("Resources_assets", "lemmatization-es.txt")
    dictionaryMap = create_dic_map(dic)
    #doc = re.sub(r"\b(\w+)\b", lambda w: get_lemma(w.group(1), dictionaryMap), doc)
    # lemmatize_and_remove_punctuations is a companion helper defined in the same module (not shown here)
    f = TextIO_helper.create_raw_text("Resources_assets", "test_" + name)
    f.writelines(lemmatize_and_remove_punctuations(doc, dictionaryMap))
    print("finished")
    f.close()
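A toy usage sketch of lemmatize_and_remove_stopwords_punctuations with invented lemma mappings and stopwords; it assumes dictionaryMap maps word to lemma and that get_lemma falls back to the original word when no lemma is found:

toy_dictionary = {"gatos": "gato", "comen": "comer"}
toy_stopwords = ["los", "y"]
tokens = lemmatize_and_remove_stopwords_punctuations(
    "los gatos comen pescado, y duermen", toy_dictionary, toy_stopwords)
print(tokens)  # expected, under those assumptions: ['gato', 'comer', 'pescado', 'duermen']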