Example #1
def baseline(qbow, text, stopwords):
    # Collect all the candidate answers
    answers = []
    qbow = set([nltk.LancasterStemmer().stem(word) for word in qbow])
    qbow.update(set(lemmatizer(qbow)))
    print(qbow)
    for f in text:
        for sent in f:
            # A list of all the word tokens in the sentence
            sbow = get_bow(sent, stopwords)

            # stem the sentence words, as was done for the question, for better matching
            sbow = set([nltk.LancasterStemmer().stem(word) for word in sbow])
            # and then add the lemmatized forms as well
            sbow.update(set(lemmatizer(sbow)))

            print(sbow)

            # Count the # of overlapping words between the Q and the A
            # & is the set intersection operator
            overlap = len(qbow & sbow)
            print(c.OKGREEN + "overlap: " + c.ENDC + str(overlap))

            answers.append((overlap, sent))

    # Sort the results by the first element of the tuple (i.e., the overlap count)
    # sorted() orders from smallest to largest by default, so reverse it
    answers = sorted(answers, key=operator.itemgetter(0), reverse=True)
    #print(answers)

    # Return the best answer
    best_answer = (answers[0])[1]

    return best_answer
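The scoring above reduces to intersecting two stemmed bags of words. As a minimal, self-contained sketch of that idea (with made-up word lists, independent of the get_bow, lemmatizer, and c helpers the function assumes):

import nltk

stemmer = nltk.LancasterStemmer()
question = ["who", "wrote", "the", "novel"]
sentence = ["the", "novel", "was", "written", "by", "the", "author"]

qbow = {stemmer.stem(w) for w in question}
sbow = {stemmer.stem(w) for w in sentence}

# & is the set intersection operator; its size is the overlap score
print(len(qbow & sbow), qbow & sbow)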
Example #2
def main():
    url = "http://www.networksciencelab.com"
    with urlopen(url) as doc:
        soup = BeautifulSoup(doc, 'html.parser')

    links = [(link.string, link['href']) for link in soup.find_all('a')
             if link.has_attr('href')]
    # print(links)

    ls = nltk.LancasterStemmer()

    # word tokenize
    words = nltk.word_tokenize(soup.text)

    # lowercase
    words = [w.lower() for w in words]

    # remove stopwords
    words = [
        ls.stem(w) for w in words
        if w not in stopwords.words('english') and w.isalnum()
    ]

    # count frequency
    freqs = Counter(words)
    print(freqs.most_common(10))
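Snippets like this rely on NLTK data packages that are not installed with the library itself; if they are missing, a one-time download along these lines is needed (not part of the original example):

import nltk

nltk.download('punkt')      # models for word_tokenize / sentence tokenization
nltk.download('stopwords')  # stopword lists used by stopwords.words('english')
nltk.download('wordnet')    # data required by WordNetLemmatizer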
Example #3
def stem(tokens):

    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    # collect the Porter stem of every non-stopword token
    stems = []
    for t in tokens:
        if t not in stopwords:
            stems.append(porter.stem(t))
    return stems
Example #4
def basics(file):
    f = open(file)
    raw = f.read()
    f.close()
    # The number of characters in the text
    print("chars=>%(len_raw)s" % {"len_raw": len(raw)})

    tokens = nltk.word_tokenize(raw)

    # The number of words in the text
    print("len_tokens=>%(len_tokens)s" % {"len_tokens": len(tokens)})

    # The number of sentences in the text
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    sents = sent_tokenizer.tokenize(raw)
    print("sentences=>%(len_sents)s" % {"len_sents": len(sents)})

    porter = nltk.PorterStemmer()

    porter_stems = [porter.stem(t) for t in tokens]
    print("porter_stems=>%(len_stems)s" % {"len_stems": len(porter_stems)})

    lancaster = nltk.LancasterStemmer()

    lancaster_stems = [lancaster.stem(t) for t in tokens]
    print("lancaster_stems=>%(len_stems)s" % {
        "len_stems": len(lancaster_stems)
    })

    wnl = nltk.WordNetLemmatizer()

    wnl_stems = [wnl.lemmatize(t) for t in tokens]
    print("wnl_stems=>%(len_stems)s" % {"len_stems": len(wnl_stems)})
Example #5
def preproc_txt(doc, stemm):
    ''' Returns a string produced by removing stopwords and words of three
    characters or fewer. It can also stem the remaining words with the Porter
    or Lancaster algorithm, or lemmatize them with the WordNet lemmatizer. '''
    tokens = nltk.word_tokenize(doc)
    stpw = [
        word for word in tokens
        if word not in stopwords.words('english') and len(word) > 3
    ]
    if stemm == 1:
        lemma = nltk.WordNetLemmatizer()
        stmw = [lemma.lemmatize(word) for word in stpw]
        text = nltk.Text(stmw)
    elif stemm == 2:
        stemmer = nltk.PorterStemmer()
        stmw = [stemmer.stem(word) for word in stpw]
        text = nltk.Text(stmw)
    elif stemm == 3:
        stemmer = nltk.LancasterStemmer()
        stmw = [stemmer.stem(word) for word in stpw]
        text = nltk.Text(stmw)
    else:
        text = nltk.Text(stpw)

    pproc_txt = ' '.join(text)
    return pproc_txt
Example #6
def tweet_obrabiarka(tweet, hashowac, stemmer):
    """
    This function is to use outside of modul, it gives complex tweet preprocessing.
    :param tweet: one tweet as a String
    :param hashowac: parameter which indidate will of using hashtags or no: "0" - no hashtags will be taking into
            account, "1" hashtags will be treated as normal words
    :param stemmer: parameter which let you choose type of stemmers: "1" -  Porter Stemmer (less agressive, words more
                    like natural words but not always), "2" - Lancaster Stemer (more agressive, words less like natural)
                    words, "3" - lemmatization (words like normal words, but more time-consuming)
    :return:
    """
    tt = tweet_tokenizator(tweet)
    rp = remove_punctuation(tt, hashowac)
    rs = remover_stopwords(rp)
    if stemmer == 1:
        ps = nl.PorterStemmer()
    if stemmer == 2:
        ps = nl.LancasterStemmer()
    if stemmer == -1:
        ps = nl.WordNetLemmatizer()
    output_list = []
    for word in rs:
        word = word.lower()
        if stemmer > 0:
            word = ps.stem(word)
        if stemmer < 0:
            word = ps.lemmatize(word)
        output_list.append(word)
    return output_list
Example #7
def descendantofWords():
    words = filterTokenWords()
    lancaster = nltk.LancasterStemmer()
    descwords = []
    for w in words:
        descwords.append(lancaster.stem(w))
    return descwords
Example #8
def stemmer(tokens):
    """
	文档词干化
	"""
    st = nltk.LancasterStemmer()
    tokens = [st.stem(word) for word in tokens]
    return tokens
Example #9
def stem_words(words):
    stemmer = nltk.LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
Example #10
def newstemmer(words, stemmer, threshold=20):
    """A stemmer that uses the Lancaster/Porter stemmer plus a dictionary."""
    import pickle
    import os
    import nltk
    bncwordlist = pickle.load(open('spindle-code-master/keywords/bnc.p', 'rb'))
    bnc_commonwords = {k for (k, v) in bncwordlist.items() if v > threshold}
    # if words is a raw string, tokenise it
    if isinstance(words, str):
        tokens = nltk.word_tokenize(words)
    # or, if it is already a list of tokens, use it as-is
    else:
        tokens = words
    # define stemmer based on the argument we passed in
    if stemmer == 'Lancaster':
        stemmertouse = nltk.LancasterStemmer()
    if stemmer == 'Porter':
        stemmertouse = nltk.PorterStemmer()
    # empty list of stems
    stems = []
    for w in tokens:
        # stem each word
        stem = stemmertouse.stem(w)
        # if the stem is in the bnc list
        if stem in bnc_commonwords:
            # add the stem
            stems.append(stem)
        else:
            # or else, just add the word as it was
            stems.append(w)
    return stems
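The key idea is conservative stemming: a token is replaced by its stem only when that stem is itself a frequent dictionary word, otherwise the original token is kept. A toy illustration of the same rule, with a small inline set standing in for the pickled BNC word list:

import nltk

common_words = {"run", "walk", "talk"}   # stand-in for bnc_commonwords
porter = nltk.PorterStemmer()

tokens = ["running", "walked", "antidisestablishmentarianism"]
stems = [porter.stem(w) if porter.stem(w) in common_words else w
         for w in tokens]
# the first two tokens collapse to their common stems; the last is kept as-is
print(stems)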
Example #11
def stem():

    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()

    print([porter.stem(t) for t in tokens])
    print([lancaster.stem(t) for t in tokens])
Example #12
 def __init__(self,
              pattern='\W+',
              lower=False,
              stem=False,
              stemmer_name='porter',
              pos=False,
              ngram=1):
     # RE pattern used in tokenization
     self.pattern = pattern
     # Ngram: Default = 1
     self.ngram = int(ngram)
     # Convert terms to lower case
     self.lower = lower
     # Ignore PoS and Stemmers if NLTK not installed
     if not my_nltk:
         self.pos = False
         self.stem = False
     else:
         self.pos = pos
         self.stem = stem
         self.stemmer_name = stemmer_name
         self.stemmers = {
             'lancaster': my_nltk.LancasterStemmer(),
             'porter': my_nltk.PorterStemmer()
         }
     self.frequent_terms = []
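The constructor only stores configuration; the class's actual tokenization logic is not shown. Purely as a hypothetical sketch (the method name, the use of re.split with self.pattern, and the omission of the PoS/n-gram options are all assumptions), a method of the same class might apply the selected stemmer like this:

import re

def tokenize(self, text):
    # hypothetical sketch: split on the configured RE pattern, drop empty strings
    terms = [t for t in re.split(self.pattern, text) if t]
    if self.lower:
        terms = [t.lower() for t in terms]
    if self.stem:
        # look up the configured NLTK stemmer ('porter' or 'lancaster')
        terms = [self.stemmers[self.stemmer_name].stem(t) for t in terms]
    return terms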
Example #13
def show_word_in_context2(target_word, text, context_size=5):
    """ Better concordance searching tool: stemmer used and punctuation removed
    """
    stemmer = nltk.LancasterStemmer()

    # Target word pre-processing
    target_stem = stemmer.stem(target_word.lower())

    # Text pre-processing
    text = text.lower()
    for punct in punctuation:
        text = text.replace(punct, " ")

    # Make a bag of words, retaining order
    words = nltk.word_tokenize(text)

    # Search and print
    text_parts = []
    for word_num, word in enumerate(words):
        if stemmer.stem(word) == target_stem:
            start = max(word_num - context_size, 0)
            stop = word_num + context_size + 1
            text_parts.append(words[start:stop])

    return text_parts
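A possible usage sketch (the function above assumes `from string import punctuation`, and NLTK's punkt data must be available). Any word whose Lancaster stem equals the stem of the target is returned together with its surrounding words:

text = ("The runners were running along the river. "
        "One runner stopped to rest, then ran on.")
for window in show_word_in_context2("run", text, context_size=3):
    print(" ".join(window))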
Example #14
def exercise30():
    words = nltk.word_tokenize(SimpleText)
    porter = nltk.PorterStemmer()
    porter = [porter.stem(word) for word in words]
    lancaster = nltk.LancasterStemmer()
    lancaster = [lancaster.stem(word) for word in words]
    print("words in lancaster not in porter : ", set(lancaster) - set(porter))
    print("words in porter not in lancaster : ", set(porter) - set(lancaster))
Example #15
def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = nltk.LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
Example #16
    def lancasterStemmer(self, txtTokens):
        """
		Use Lancaster stemmer to stem a text
		@params txtTokens: the tokens of the text
		@rtype: {List}
		"""
        lancaster = nltk.LancasterStemmer()
        return [lancaster.stem(t) for t in txtTokens]
Example #17
def get_stem():
    """
    The Porter Stemmer is a good choice
        if you are indexing some texts
        and want to support search using alternative forms of words
    """
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    print([porter.stem(t) for t in tokens])
    print([lancaster.stem(t) for t in tokens])
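The claim in the docstring — that Porter stems let a search match alternative forms of a word — can be checked by stemming a few related forms, which typically collapse to one key:

import nltk

porter = nltk.PorterStemmer()
for w in ["index", "indexes", "indexed", "indexing", "indexer"]:
    print(w, "->", porter.stem(w))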
Example #18
 def get_stemming(self, type):
     if not self.__stemming_list:
         self.__stemming_list = {
             'rslps': nltk.stem.RSLPStemmer(),
             'porter': nltk.PorterStemmer(),
             'lancaster': nltk.LancasterStemmer(),
             'english': nltk.stem.snowball.EnglishStemmer(),
             'portuguese': nltk.stem.snowball.PortugueseStemmer()
         }
     return self.__stemming_list[type]
Example #19
def ch03_30_porter_vs_lancaster():
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    tokens = [
        "When", "all", "is", "said", "and", "done", ",", "more", "is", "said",
        "than", "done", "."
    ]
    print "porter=", [porter.stem(w.lower()) for w in tokens]
    print "lancaster=", [lancaster.stem(w.lower()) for w in tokens]
    print "len(tokens)=", map(lambda token: len(token), tokens)
Example #20
def exercise30():
    # 3.	exercise 30.  In this question, consider SimpleText for reporting your results.
    # ◑ Use the Porter Stemmer to normalize some tokenized text, calling the stemmer on each word. Do the same thing with the Lancaster Stemmer and see if you observe any differences.
    tokens = nltk.word_tokenize(SimpleText)
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    porter_list = [porter.stem(t) for t in tokens]
    lancaster_list = [lancaster.stem(t) for t in tokens]
    print("porter: ", porter_list)
    print("lancaster: ", lancaster_list)
Example #21
def filter_funcs_for_context(ctx):
    filter_funcs = []
    if ctx.enable_casefolding:
        filter_funcs.append(CaseFolder().call)
    if ctx.enable_stopwords:
        filter_funcs.append(StopWordFilter(BASE_STOPWORDS).call)
    if ctx.remove_nonalphanumeric:
        filter_funcs.append(AlphaNumericFilter().call)
    if ctx.enable_stemming:
        filter_funcs.append(Stemmer(nltk.LancasterStemmer()).call)
    return filter_funcs
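The CaseFolder, StopWordFilter, AlphaNumericFilter, and Stemmer classes are not shown; judging from how they are used, each exposes a call method that is collected into the filter pipeline. A hypothetical sketch of the Stemmer wrapper under that assumption:

class Stemmer:
    """Wraps an NLTK stemmer behind the pipeline's call(tokens) interface (assumed)."""

    def __init__(self, nltk_stemmer):
        self.nltk_stemmer = nltk_stemmer

    def call(self, tokens):
        # stem every token and return the new list
        return [self.nltk_stemmer.stem(t) for t in tokens]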
Example #22
def word_stems(string, stemmer="lancaster"):
    words = get_words(string)
    if stemmer == "lancaster":
        lancaster = nltk.LancasterStemmer()
        return [lancaster.stem(t) for t in words]
    elif stemmer == "porter":
        porter = nltk.PorterStemmer()
        return [porter.stem(t) for t in words]
    else:
        print("Stemmer '%s' not recognized... Try using 'lancaster' or 'porter'."
              % stemmer)
Example #23
 def stem(self, Porter=True):
     '''
     Function: Stem all tokens.
     Porter: If True, the Porter stemmer is used; otherwise the Lancaster stemmer is used.
     '''
     self.backup = self.lst_tk_lsts
     import nltk
     stemmer = nltk.LancasterStemmer()
     if Porter: stemmer = nltk.PorterStemmer()
     self.lst_tk_lsts = [[stemmer.stem(t) for t in l if type(t) == str]
                         for l in self.lst_tk_lsts]
Example #24
def fun2():
    # NLTK includes several ready-made stemmers. If you need one, you should use
    # one of these rather than building your own with regular expressions, because
    # NLTK's stemmers handle a wide range of irregular cases. The Porter and
    # Lancaster stemmers strip affixes according to their own sets of rules.
    raw = """DENNIS: Listen, strange women lying in ponds distributing swords
            is no basis for a system of government. Supreme executive power derives from
            a mandate from the masses, not from some farcical aquatic ceremony."""
    tokens = nltk.word_tokenize(raw)
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    print([porter.stem(t) for t in tokens])
    print([lancaster.stem(t) for t in tokens])
Example #25
def do_normalization(text):
    stemmer = nltk.LancasterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()

    # text = text.lower()

    text = ' '.join([stemmer.stem(s) for s in text.split(' ')])
    text = ' '.join([lemmatizer.lemmatize(s) for s in text.split(' ')])

    text = remove_punctuations(text)

    return text
Example #26
def stemming(tokenized_text, Stemmer):
    '''
    Performs stemming with either Porter or Lancaster Stemmer
    '''
    if Stemmer == 'Porter':
        ps = nltk.PorterStemmer()

    elif Stemmer == 'Lancaster':
        ps = nltk.LancasterStemmer()

    else:
        raise ValueError("Stemmer must be 'Porter' or 'Lancaster'")

    text = [ps.stem(word) for word in tokenized_text]
    return text
Example #27
def do_normalization(text):
    stemmer = nltk.LancasterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()

    # removing this line improves the performance of the classifier after normalization
    text = text.lower()

    text = ' '.join([stemmer.stem(s) for s in text.split(' ')])
    text = ' '.join([lemmatizer.lemmatize(s) for s in text.split(' ')])

    text = remove_punctuations(text)

    return text
Example #28
def preprocess(text):
    """
    Normalize -> Tokenize -> Stem -> Remove stopwords
    :param text: String, sentence
    :return: Returns preprocessed text as a list
    """

    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    words = nltk.word_tokenize(text)
    stemmer = nltk.LancasterStemmer()
    stop = set(stopwords.words('english'))
    words = [stemmer.stem(word) for word in words]
    words = [word for word in words if word not in stop]

    return words
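A possible usage sketch (punkt and stopwords data required). Note the design choice above: stopwords are removed after Lancaster stemming, so a stemmed form that no longer matches a stopword entry will slip through the filter:

sentence = "The children were running through the gardens!"
print(preprocess(sentence))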
Example #29
def q_thirty():
    raw = "For god's sake, this is way too difficult. I need hints many, many hints. Argh!"
    tokens = nltk.word_tokenize(raw)
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()

    print "PorterStemmer :"
    for t in tokens:
        print porter.stem(t),
    print "\nLancasterStemmer :"
    for t in tokens:
        print lancaster.stem(t),
    print """
Example #30
    def search_answer(self, cnstrd_word_syn, wd_in_sent, key_wd_idx):
        """
        This function searches the constrainted word of the question
        Parameters:
            cnstrd_word_syn (list) - the list of the synonyms of the constrainted word in the question
            wd_in_sent (list) - word tokenization text
            key_wd_idx (int) - the position of the key word in the sentence
        return: the position of the constrainted word in the sentence of the text  
        """
        porter = nltk.PorterStemmer()
        lancaster = nltk.LancasterStemmer()
        #print cnstrd_word_syn
        for cw in cnstrd_word_syn:
            cw_seperate = []
            if '_' in cw:

                cw1 = cw.split('_')[0]
                cw2 = cw.split('_')[1]
                cw_seperate = [cw1, cw2]

                cw = ' '.join(cw.split('_'))

                cw_seperate.append(cw)
            #print(cw)
            for sent in wd_in_sent[key_wd_idx:]:
                #print(cw)
                #print(cw, sent)
                #print sent
                """
                if cw_seperate:
                    for c_s in cw_seperate:
                        if porter.stem(c_s.lower()) == porter.stem(sent.lower()) or lemma(c_s) == lemma(sent): #or sent.lower() in cw.lower() or lemma():
                            print("!!!!!!!!")
                            print(cw, sent)
                            print(wd_in_sent.index(sent))
                            return wd_in_sent.index(sent)
                """
                if porter.stem(cw.lower()) == porter.stem(
                        sent.lower()) or lemma(cw) == lemma(
                            sent):  #or sent.lower() in cw.lower() or lemma():
                    #print("!!!!!!!!")
                    #print(cw, sent)
                    #print(wd_in_sent.index(sent))
                    return wd_in_sent.index(sent)
                """
                elif cw_seperate:
                    for cw_s in cw_seperate:
                        if porter.stem(cw.lower()) == porter.stem(sent.lower()) or lemma(cw) == lemma(sent):
                            return wd_in_sent.index(sent)
                """
        return None