Example #1
    def clickSearch(self):
        getoutLoop = True
        getAnswer = True
        porter = PorterStemmer()
        lancaster = LancasterStemmer()
        getText = self.textSearch.toPlainText()
        #getText = getText.lower()
        listWords2=getText.split(' ')
        #valuesBox.append(getText)
        print(len(valuesBox))
        if len(valuesBox) != 0:
            saveword = getText
            for i in range(len(valuesBox)):
                print(valuesBox)
                if len(valuesBox) == 0:
                    valuesBox.append(getText)
                    self.textTopRight.append(getText)
                

                else:
                    if valuesBox[i] == getText:
                        print('We have already searched for this value')
                        getAnswer = False
                        try:
                           
                            f = pd.read_csv(r''+globalPath +getText + '.csv')
                        except IOError:
                            self.sorryMessagenullfile()

                        base_html = """
                        <!doctype html>
                        <html><head>
                        <meta http-equiv="Content-type" content="text/html; charset=utf-8">
                        <script type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js"></script>
                        <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.16/css/jquery.dataTables.css">
                        <script type="text/javascript" src="https://cdn.datatables.net/1.10.16/js/jquery.dataTables.js"></script>
                        </head><body>%s<script type="text/javascript">$(document).ready(function(){$('table').DataTable({
                            "pageLength": 50
                        });});</script>
                        </body></html>
                        """

                        def df_html(y):
                            """HTML table with pagination and other goodies"""
                            df_html = y.to_html()
                            return base_html % df_html

                        def df_window(x):
                            """Open dataframe in browser window using a temporary file"""
                            with NamedTemporaryFile(delete=False, suffix='.html', mode='w+', encoding='UTF8') as f:
                                f.write(df_html(x))
                            webbrowser.open(f.name)

                        michalis = pd.DataFrame(f)
                        df_window(michalis)

                        print(f)
                        print('SUCCESS')
                        getoutLoop = False
                        break

            else:
                print('Not inside the list >>> ADD TO THE LIST')
                valuesBox.append(getText)
                self.textTopRight.append(getText)

        else:
            saveword = getText
            print('the list is empty')
            valuesBox.append(getText)
            self.textTopRight.append(getText)

        try:    
            
            # The Spark SQL query that performs the search (results ordered by type)
    
##############################################################################################################################
            if len(listWords2) >1:
                print('More than one word in the query')
                dfQuery = spark.sql("Select * from netflix where title RLIKE  " + "'" + getText + "'or type RLIKE "+ "'"
                                                                                    + getText +"'or director RLIKE "+ "'" + getText +"' or cast RLIKE " + "'" +getText + "' or country RLIKE " 
                                                                                    + "'"+getText+"' or description RLIKE "+ "'" + getText +"' or duration RLIKE " + "'" +getText + "' or rating RLIKE " + "'"+getText+"'or listed_in RLIKE " + "'"+getText+"'" ) 
                print(dfQuery.collect())   

                if len(dfQuery.collect()) == 0:
                    self.sorryMessage()
                else:
                    df3 = pd.DataFrame(dfQuery.collect())
                    df3.to_csv(r''+globalPath+getText+'.csv')
                    base_html = """
                    <!doctype html>
                    <html><head>
                    <meta http-equiv="Content-type" content="text/html; charset=utf-8">
                    <script type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js"></script>
                    <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.16/css/jquery.dataTables.css">
                    <script type="text/javascript" src="https://cdn.datatables.net/1.10.16/js/jquery.dataTables.js"></script>
                    </head><body>%s<script type="text/javascript">$(document).ready(function(){$('table').DataTable({
                        "pageLength": 50
                    });});</script>
                    </body></html>
                    """

                    def df_html(y):
                        """HTML table with pagination and other goodies"""
                        df_html = y.to_html()
                        return base_html % df_html

                    def df_window(x):
                        """Open dataframe in browser window using a temporary file"""

                        with NamedTemporaryFile(delete=False, suffix='.html', mode='w+', encoding='UTF8') as f:
                            f.write(df_html(x))
                        webbrowser.open(f.name)

                    michalis2 = pd.DataFrame(df3)
                    df_window(michalis2)



###############################################################################################################################################
            else:
                print('Only one word in the query')
                dfQuery = spark.sql("Select * from netflix where title like" + "'% " + getText + "%' or  type like" + "'% " + getText + "%' or director like" + "'% " + getText + "%' or cast like" + "'% " + getText + "%' or country like" + "'%" + getText + "%'   or date_added like" +
                                    "'% " + getText + "%'  or release_year like" + "'% " + getText + "%'  or rating like" + "'% " + getText + "%'  or duration like" + "'% " + getText + "%'   or listed_in like" + "'% " + getText + "%' or description like" + "'% " + getText + "%' order by type")
                getText = porter.stem(getText)
                distData = spark.sql("Select * from netflix where title like" + "'% " + getText + "%' or  type like" + "'% " + getText + "%' or director like" + "'% " + getText + "%' or cast like" + "'% " + getText + "%' or country like" + "'%" + getText + "%'   or date_added like" +
                                    "'% " + getText + "%'  or release_year like" + "'% " + getText + "%'  or rating like" + "'% " + getText + "%'  or duration like" + "'% " + getText + "%'   or listed_in like" + "'% " + getText + "%' or description like" + "'% " + getText + "%' order by type")

                if (dfQuery.count() >= distData.count()): 
                    if (dfQuery.count() == 0):
                        self.sorryMessage()
                    else:
                        df3 = pd.DataFrame(dfQuery.collect())
                        df3.to_csv(r''+globalPath+saveword+'.csv', index=False)
                        #j = dfQuery.select(col("*")).collect()
                        # self.resultText.append(str(j))
                        base_html = """

                        <!doctype html>
                        <html><head>
                        <meta http-equiv="Content-type" content="text/html; charset=utf-8">
                        <script type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js"></script>
                        <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.16/css/jquery.dataTables.css">
                        <script type="text/javascript" src="https://cdn.datatables.net/1.10.16/js/jquery.dataTables.js"></script>
                        </head><body>%s<script type="text/javascript">$(document).ready(function(){$('table').DataTable({
                            "pageLength": 50
                        });});</script>
                        </body></html>
                        """

                        def df_html(y):
                            """HTML table with pagination and other goodies"""
                            df_html = y.to_html()
                            return base_html % df_html

                        def df_window(x):
                            """Open dataframe in browser window using a temporary file"""

                            with NamedTemporaryFile(delete=False, suffix='.html', mode='w+', encoding='UTF8') as f:
                                f.write(df_html(x))
                            webbrowser.open(f.name)

                        michalis2 = pd.DataFrame(df3)
                        df_window(michalis2)
                else:
                    if getAnswer == True:
                        answer = self.takeinputs(getText)
                        if answer == 'YES':
                            print('Success')
                            df2 = pd.DataFrame(distData.collect())
                            df2.to_csv(
                                r''+globalPath+saveword+'.csv')

                            base_html = """
                            <!doctype html>
                            <html><head>
                            <meta http-equiv="Content-type" content="text/html; charset=utf-8">
                            <script type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js"></script>
                            <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.16/css/jquery.dataTables.css">
                            <script type="text/javascript" src="https://cdn.datatables.net/1.10.16/js/jquery.dataTables.js"></script>
                            </head><body>%s<script type="text/javascript">$(document).ready(function(){$('table').DataTable({
                                "pageLength": 50
                            });});</script>
                            </body></html>
                            """

                            def df_html(y):
                                """HTML table with pagination and other goodies"""
                                df_html = y.to_html()
                                return base_html % df_html

                            def df_window(x):
                                """Open dataframe in browser window using a temporary file"""
                                with NamedTemporaryFile(delete=False, suffix='.html', mode='w+', encoding='UTF8') as f:
                                    f.write(df_html(x))
                                webbrowser.open(f.name)

                            michalis = pd.DataFrame(df2)
                            df_window(michalis)
                        else:
                            print('Clicked NO')
                            df2 = pd.DataFrame(dfQuery.collect())
                            df2.to_csv(
                                r''+globalPath+saveword+'.csv', index=False)

                            base_html = """
                            <!doctype html>
                            <html><head>
                            <meta http-equiv="Content-type" content="text/html; charset=utf-8">
                            <script type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js"></script>
                            <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.16/css/jquery.dataTables.css">
                            <script type="text/javascript" src="https://cdn.datatables.net/1.10.16/js/jquery.dataTables.js"></script>
                            </head><body>%s<script type="text/javascript">$(document).ready(function(){$('table').DataTable({
                                "pageLength": 50
                            });});</script>
                            </body></html>
                            """

                            def df_html(y):
                                """HTML table with pagination and other goodies"""
                                df_html = y.to_html()
                                return base_html % df_html

                            def df_window(x):
                                """Open dataframe in browser window using a temporary file"""
                                with NamedTemporaryFile(delete=False, suffix='.html', mode='w+', encoding='UTF8') as f:
                                    f.write(df_html(x))
                                webbrowser.open(f.name)

                            michalis = pd.DataFrame(df2)
                            df_window(michalis)

                    else:
                        print('out of loop')

        except NotFoundError:
            print('Out of limit')
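
Note: the base_html template and the df_html / df_window helpers above are repeated verbatim in every branch of clickSearch. A minimal sketch of how that boilerplate could be defined once (an illustrative refactor, not part of the original code; it assumes the same imports the snippet already relies on: pandas as pd, webbrowser, and tempfile.NamedTemporaryFile):

BASE_HTML = """
<!doctype html>
<html><head>
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
<script type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.2/jquery.min.js"></script>
<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.16/css/jquery.dataTables.css">
<script type="text/javascript" src="https://cdn.datatables.net/1.10.16/js/jquery.dataTables.js"></script>
</head><body>%s<script type="text/javascript">$(document).ready(function(){$('table').DataTable({"pageLength": 50});});</script>
</body></html>
"""

def show_dataframe(df):
    """Render a DataFrame as a paginated DataTables page and open it in the browser."""
    with NamedTemporaryFile(delete=False, suffix='.html', mode='w+', encoding='UTF8') as tmp:
        tmp.write(BASE_HTML % df.to_html())
    webbrowser.open(tmp.name)

Each branch could then simply call show_dataframe(df3) (or df2) instead of re-declaring the helpers.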
Example #2
# Stemming - normalize words into their base or root form
import os
import nltk
import nltk.corpus

#importing stemmer
#PorterStemmer
from nltk.stem import PorterStemmer
pst = PorterStemmer()

words_to_stem = ['give', 'giving', 'given', 'gave']
for words in words_to_stem:
    print(words + ':' + pst.stem(words))
#LancasterStemmer
from nltk.stem import LancasterStemmer
lst = LancasterStemmer()
for words in words_to_stem:
    print(words + ':' + lst.stem(words))
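
For comparison (not part of the snippet above), NLTK's SnowballStemmer can be run over the same word list; Porter is generally the most conservative of the three stemmers and Lancaster the most aggressive:

#SnowballStemmer
from nltk.stem import SnowballStemmer
sst = SnowballStemmer('english')
for words in words_to_stem:
    print(words + ':' + sst.stem(words))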
Example #3
def tokenize_and_normalize_text(
      text,
      wordpunct=True,
      filter_stopwords=True,
      normalizer='wordnet',
      lang='english'
  ):
  """
  Remove stopwords, bare punctuation, capitalization; lemmatize or stem words

  Parameters
  ----------
  text : string
    a single string of words and punctuation, a "text"
  filter_stopwords : boolean (default True)
    if True, filter out stopwords in nltk.corpus.stopwords
  normalizer : string or None (default 'wordnet')
    if 'wordnet', lemmatizes words
    if in ['porter', 'lancaster', 'snowball'], stems words
    if None, doesn't normalize words
  lang : string (default 'english')
    language to use for stopwords and snowball stemmer

  Returns
  -------
  norm_words : list of strings
    list of normalized words comprising text
  """

  # check input
  if not isinstance(text, basestring):
    print '**WARNING: text is not a string!'
    return None

  # check stopwords arg
  # if lang not in stopwords.fileids():
  #   print '***ERROR: lang', lang, 'not in', stopwords.fileids(), '!'
  #   return None
  stops = frozenset(stopwords.words(lang))

  # tokenize words
  if wordpunct is True:
    words = wordpunct_tokenize(text.lower())
  else:
    words = word_tokenize(text.lower())

  # remove stopwords
  if filter_stopwords is True:
    good_words = (word for word in words
            if not all([char in punctuation for char in word])
            and len(word) > 0 and len(word) < 25
            and word not in stops)
  else:
    good_words = (word for word in words
            if not all([char in punctuation for char in word])
            and len(word) > 0 and len(word) < 25)

  # normalize text
  normalizers = ['wordnet', 'porter', 'lancaster', 'snowball']
  if normalizer == 'wordnet':
    lemmatizer = WordNetLemmatizer()
    norm_words = [lemmatizer.lemmatize(word) for word in good_words]
  elif normalizer in ['porter', 'lancaster', 'snowball']:
    if normalizer == 'porter':
      stemmer = PorterStemmer()
    elif normalizer == 'lancaster':
      stemmer = LancasterStemmer()
    elif normalizer == 'snowball':
      if lang not in SnowballStemmer.languages:
        print '***ERROR: lang', lang, 'not in', SnowballStemmer.languages, '!'
        return None
      stemmer = SnowballStemmer(lang)
    norm_words = [stemmer.stem(word) for word in good_words]
  elif normalizer is None:
    norm_words = good_words
  else:
    print '***ERROR: normalizer', normalizer, 'not in', normalizers, '!'
    return None

  return norm_words
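
A hypothetical call (a sketch; it assumes the names the function relies on -- wordpunct_tokenize, word_tokenize, stopwords, punctuation, WordNetLemmatizer, PorterStemmer, LancasterStemmer and SnowballStemmer -- have already been imported from nltk and string):

norm = tokenize_and_normalize_text('The cats were running faster than the dogs.',
                                   normalizer='lancaster')
# expected to be roughly ['cat', 'run', 'fast', 'dog']; the exact stems depend on the stemmer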
Example #4
 def __init__(self):
     self.tok = RegexpTokenizer(r'some_regular_expression')
     self.stemmer = LancasterStemmer()
Example #5
# Reading from a Text File
with open('Input.txt', 'r') as text_file:
    read_data = text_file.read()

# Tokenization
"""Sentence Tokenization"""
sentTok = sent_tokenize(read_data)
print("Sentence Tokenization : \n", sentTok)

"""Word Tokenization"""
tokens = [word_tokenize(t) for t in sentTok]
print("Word Tokenization : \n", tokens)

# Stemming
# 1 -> LancasterStemmer
lStem = LancasterStemmer()
print("Lancaster Stemming : \n")
for sent in tokens:
    print([lStem.stem(tok) for tok in sent])

# 2 -> SnowBallStemmer
sStem = SnowballStemmer('english')
print("SnowBall Stemming : \n")
for sent in tokens:
    print([sStem.stem(tok) for tok in sent])

# 3 -> PorterStemmer
pStem = PorterStemmer()
print("Porter Stemming : \n")
for sent in tokens:
    print([pStem.stem(tok) for tok in sent])
Example #6
class Bayes_stemmed_lancester(Bayes_stemmed):

    STEMMER = LancasterStemmer()
Example #7
 def lancaster_stemmer(self, tokens):
     ls = LancasterStemmer()
     filtered_tokens = [ls.stem(token) for token in tokens]
     return filtered_tokens
Example #8
def create_data():
    tt = TweetTokenizer()
    ls = LancasterStemmer()
    tr = Translator()

    f1 = open('../combined_useful.json')

    parsed_json = []
    for line in f1:
        parsed_json.append(json.loads(line))

    print len(parsed_json[0]['data'])

    stop_words_file = open('twitterStopWords.txt')
    punctuation_file = open('punctuations.txt')
    stop_words = []
    punctuations = []

    for word in stop_words_file:
        stop_words.append(word.rstrip('\n'))

    for punc in punctuation_file:
        punctuations.append(punc.rstrip('\n'))

    abbrs = {
        'u': 'you',
        'n': 'and',
        'l8': 'late',
        'ur': 'your',
        'k': 'ok',
        'wer': 'where',
        'wen': 'when',
        'b': 'be',
        'y': 'why',
        'dis': 'this',
        'v': 'we',
        'plz': 'please',
        'pls': 'please',
        'thr': 'there',
        'shd': 'should',
        'iam': 'i am',
        'masikitos': 'mosquito',
        'sec': 'security',
        'reqd': 'required',
        'mgmt': 'management',
        'hyd': 'hyderabad',
        'brb': 'be right back',
        'fr': 'for',
        'prob': 'problem',
        'don': 'do not',
        'pic': 'picture',
        'ny': 'any',
        'nyc': 'nice',
        'nic': 'nice',
        'tmrw': 'tomorrow'
    }

    len1 = len(parsed_json[0]['data'])
    len2 = len1

    data = []
    for d in parsed_json[0]['data']:
        data.append(d)

    og_tweets = []
    tweets = []
    names = []
    handles = []
    hashtags = []

    labels = []
    for i, d in enumerate(data):
        temp = ""
        cnt1 = 0
        og_tweets.append(d['text'])
        names.append(d['name'])
        try:
            x = re.sub(r"http\S+", "", d['text'])
            x = re.sub(r"www\S+", "", x)
        except Exception:
            continue

        for ch in x:
            if ord(ch) > 127:
                cnt1 += 1
                continue
            if str(ch.encode('ascii', 'ignore')) in punctuations:
                temp += ' '
            else:
                temp += ch.lower()

        x = [k for k in tt.tokenize(temp)]
        tweets.append(x)

    temp_tweets = []
    for t in tweets:
        temp_t = []
        temp_hash = []
        temp_handles = []

        for w in t:
            if any(ch.isdigit() for ch in w):
                continue
            if w.startswith('@'):
                temp_handles.append(w.split('@')[1])
            elif w.startswith('#'):
                temp_hash.append(w.split('#')[1])
                for sw in splitter.split(w.split('#')[1]):
                    temp_t.append(sw)
            elif w in abbrs:
                for ab in abbrs[w].split(' '):
                    temp_t.append(ab)
            else:
                temp_t.append(w)
        temp_tweets.append(temp_t)
        hashtags.append(temp_hash)
        handles.append(temp_handles)

    tweets = temp_tweets

    new_tweets = []

    for tw in tweets:
        new_tw = []
        for word in tw:
            if word in stop_words:
                continue
            else:
                new_tw.append(word.encode('ascii', 'ignore'))
        new_tweets.append(new_tw)

    max_len = 0
    for tw in new_tweets:
        if len(tw) > max_len:
            max_len = len(tw)

    print "Training "
    w2v = Word2Vec(new_tweets,
                   size=128,
                   min_count=2,
                   iter=50,
                   negative=20,
                   window=10)
    w2v.save('relevantNetW2V_4')
    # w2v = Word2Vec.load('relevantNetW2V')

    vocab = w2v.wv.vocab

    embedded_data = []
    for i, tw in enumerate(new_tweets):
        # print labels[i]
        # print tw
        d = []
        for w in tw:
            try:
                d.append(w2v[w])
            except Exception:
                d.append(np.zeros(128))
        embedded_data.append(d)
    pickle.dump([new_tweets, names, og_tweets], open('dataset.p', 'wb'))
    return new_tweets
Example #9
def LancasterTokenizer(s):
    from nltk import word_tokenize
    from nltk.stem import LancasterStemmer
    stemmer = LancasterStemmer()
    return [stemmer.stem(t) for t in word_tokenize(s)]
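
One common way to use a stemming tokenizer like this (not shown in the snippet) is as the tokenizer argument of a scikit-learn vectorizer; a minimal sketch:

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(tokenizer=LancasterTokenizer)  # stems while tokenizing
X = vect.fit_transform(["The cooks were cooking.", "Cookery books about cooking."])
print(vect.get_feature_names_out())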
Example #10
def apply_stemming_and_lemmatize(tokens,
                                 ls=LancasterStemmer(),
                                 wnl=WordNetLemmatizer()):
    return [wnl.lemmatize(ls.stem(token)) for token in tokens]
Example #11
def write(filename, predictor):
    sentence = read_sentence(filename)
    for s in sentence:
        sentence_list, label_list = process_sentence(s)
        sen = mergeWords(sentence_list)
        # print(sen)

        ##### assign POS ############################################
        pos_list = []
        # truple = tree2conlltags(ne_chunk(pos_tag(word_tokenize(sen))))
        truple = tree2conlltags(ne_chunk(pos_tag(sentence_list)))
        # the truple contains word, pos, ner-label
        for item in truple:
            pos_list.append(item[1])

        ################get words lemma and stem######################
        wordnet_lemmatizer = WordNetLemmatizer()
        lemma_list = []
        for word in sentence_list:
            lemma_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))

        stem_list = []
        lancaster = LancasterStemmer()
        for word in sentence_list:
            stem_list.append(lancaster.stem(word))
        # print(stem_list)

        ##### assign constituency parent POS ############################
        pos_parent_list, right_sublings_list, chunk_position, left_sublings_list = parse_consituency_tree(sentence_list,
                                                                                                          predictor)
        # print("=========pos===")
        # print(len(sentence_list))
        # print(len(chunk_position))

        #####add space to each sentence############################
        sentence_list.append(" ")
        label_list.append(" ")
        pos_list.append(" ")
        pos_parent_list.append(" ")
        right_sublings_list.append(" ")
        chunk_position.append(" ")
        lemma_list.append(" ")
        stem_list.append(" ")
        left_sublings_list.append(" ")

        data = {}
        data["word"] = sentence_list
        data["label"] = label_list
        data["pos"] = pos_list
        data["chunk"] = pos_list
        data["pos_parent"] = pos_parent_list
        data["right_sublings_list"] = right_sublings_list
        data["chunk_position"] = chunk_position
        data["lemma_list"] = lemma_list
        data["stem_list"] = stem_list
        data["left_sublings_list"] = left_sublings_list
        df = pd.DataFrame(data)

        #####write features to the file############################
        to_file = filename.split(".tsv")[0]
        to_file1 = to_file + "_feature_v1" + ".tsv"
        df.to_csv(to_file1, sep='\t', index=False, header=False, encoding="utf8", mode='a')
Example #12
def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = [stemmer.stem(word) for word in words]
    return stems
Example #13
 def __init__(self, input_df=None, classes=None, transformation="lemmatization"):
     self.df = input_df
     self.classes = classes
     self.transform = transformation
     self.stemmer = LancasterStemmer() if self.transform == "stemming" else None
Example #14
def go_through():

    clean = cleaner()
    lancaster_stemmer = LancasterStemmer()
    connection = pymysql.connect(host='localhost',
                                 user='******',
                                 password='******',
                                 db='words')

    for filename in os.listdir(path):
        print(filename)
        sents = defaultdict(list)
        dotindex = filename.index('.')
        article = filename[0:dotindex]
        with codecs.open(os.path.join(path, filename),
                         'r',
                         encoding='utf-8',
                         errors='ignore') as f:
            text = f.read()
            text_array = clean.clean(text)
            for sub_sent in text_array:
                stem_temp = [
                    lancaster_stemmer.stem(item)
                    for item in word_tokenize(sub_sent)
                ]
                temp = word_tokenize(sub_sent)
                for verb in verbs:
                    verb = verb.strip().lower()
                    verb_splits = verb.split(' ')
                    if len(verb_splits) == 1:
                        if lancaster_stemmer.stem(verb) in stem_temp:
                            sents[verb].append(sub_sent)
                    else:
                        flag = 0
                        #print(verb_splits)
                        if lancaster_stemmer.stem(verb_splits[0]) in stem_temp:
                            index = stem_temp.index(
                                lancaster_stemmer.stem(verb_splits[0])
                            )  # note: small bug here, since a verb stem can appear multiple times
                            for i in range(1, len(verb_splits)):
                                if verb_splits[i] != temp[index + i]:
                                    flag = 1
                                    break
                        if flag:
                            continue
                        else:
                            sents[verb].append(sub_sent)

        with connection.cursor() as cursor:
            sql = """insert into `verbsents` (`verb`, `article`, `sents`) values (%s, %s, %s) """
            for key in sents.keys():
                try:
                    for item in sents[key]:
                        cursor.execute(sql, (key, article, item))
                except Exception as inst:
                    print(type(inst))
                    print(inst.args)
                    print(key)
                    continue
        connection.commit()

    connection.close()
Example #15
    def __init__(self, data, embed_model):
        self.data = data

        self.len = len(data)
        self.stemmer = LancasterStemmer()
        self.embed_model = embed_model
Example #16
test = pd.read_csv(TEST_DATA_FILE)

# Stemming and lemmatisation
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer

corpus = []
for i in range(0, 159571):
    Conversation = train.comment_text.astype(str)
    Conversation = re.sub('[^a-zA-Z]', ' ', Conversation[i])
    Conversation = Conversation.lower()
    Conversation = Conversation.split()
    Conversation = [word for word in Conversation if not word in set(stopwords.words('english'))]
    ps = LancasterStemmer()
    Conversation = [ps.stem(word) for word in Conversation if not word in set(stopwords.words('english'))]
    Conversation = ' '.join(Conversation)
    corpus.append(Conversation)  
    
corpus1 = []
for i in range(0, 153164):
    Conversation = test.comment_text.astype(str)
    Conversation = re.sub('[^a-zA-Z]', ' ', Conversation[i])
    Conversation = Conversation.lower()
    Conversation = Conversation.split()
    Conversation = [word for word in Conversation if not word in set(stopwords.words('english'))]
    ps = LancasterStemmer()
    Conversation = [ps.stem(word) for word in Conversation if not word in set(stopwords.words('english'))]
    Conversation = ' '.join(Conversation)
    corpus1.append(Conversation) 
Example #17
def get_overlap(context, question, pad_length):
    stemmer = LancasterStemmer()
    q_stemmed = [stemmer.stem(token.text) for token in question]
    matches = [int(stemmer.stem(token.text) in q_stemmed) for token in context]
    matches += [0] * (pad_length - len(matches))
    return matches
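
A hypothetical call, using a tiny namedtuple as a stand-in for the token objects (which only need a .text attribute, as with e.g. spaCy tokens); the result is a 0/1 list marking context tokens whose stem also appears in the question, zero-padded to pad_length:

from collections import namedtuple

Tok = namedtuple('Tok', 'text')  # stand-in for real tokenizer tokens
context = [Tok(w) for w in "the runner kept running fast".split()]
question = [Tok(w) for w in "who runs".split()]
print(get_overlap(context, question, pad_length=8))  # five match flags followed by three padding zeros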
Example #18
def string_formulation(model, feature_names, number_words, number_topics,
                       similar_words, levenshtein_distance, pub_year_one,
                       pub_year_two, bert_model, bert_tokenizer, author):
    """Formulate the search string based on the input parameters.

    Args:
        model: Return of the lda_algorithm() with the Latent
            Dirichlet Allocation (LDA) algorithm execution.
        feature_names: Dictionary generated by the bag-of-words
            representation.
        number_words: Number of words that will be broken down
            in each topic.
        number_topics: Number of topics that will be represented
            by the Latent Dirichlet Allocation (LDA).
        similar_words: Number of similar words that will be used
            to add the search string.
        levenshtein_distance: Levenshtein distance threshold used for
            comparing titles.
        pub_year_one: Upper year delimiter in the search string.
        pub_year_two: Lower year delimiter in the search string.
        bert_tokenizer: BERT Tokenizer.
        bert_model: BERT Model.
        author: Name of the Object of study.

    Return:
        message: Search string to be used.
    """
    global final_similar_word

    message = "TITLE-ABS-KEY("

    if similar_words == 0:
        for topic_index, topic in enumerate(model.components_):
            message += "(\""
            message += "\" AND \"".join([
                feature_names[i]
                for i in topic.argsort()[:-number_words - 1:-1]
            ])
            message += "\")"

            if topic_index < number_topics - 1:
                message += " OR "
            else:
                message += ""

        message += ")"

        if pub_year_one != 0:
            message += " AND PUBYEAR < "
            message += str(pub_year_one)

        return message

    else:
        lancaster = LancasterStemmer()

        for topic_index, topic in enumerate(model.components_):
            counter = 0
            message += "("

            for i in topic.argsort()[:-number_words - 1:-1]:
                counter = counter + 1

                message += "(\""
                message += "\" - \"".join([feature_names[i]])

                if " " not in feature_names[i]:
                    try:
                        similar_word = enrichment_words(
                            feature_names[i], bert_model, bert_tokenizer,
                            author)

                        # Error if the searched word is not present in the tokens
                        if similar_word == ['error']:
                            pass
                        else:
                            stem_feature_names = lancaster.stem(
                                feature_names[i])
                            stem_similar_word = []

                            final_stem_similar_word = []
                            final_similar_word = []

                            for j in similar_word:
                                stem_similar_word.append(lancaster.stem(j))

                            for number, word in enumerate(stem_similar_word):
                                if stem_feature_names != word and Levenshtein.distance(
                                        str(stem_feature_names),
                                        str(word)) > levenshtein_distance:
                                    irrelevant = 0
                                    for k in final_stem_similar_word:
                                        if Levenshtein.distance(
                                                str(k), str(word)
                                        ) < levenshtein_distance:
                                            irrelevant = 1
                                    if irrelevant == 0:
                                        final_stem_similar_word.append(word)
                                        final_similar_word.append(
                                            similar_word[number])

                            message += "\" OR \""
                            if len(final_similar_word) < similar_words:
                                message += "\" OR \"".join(
                                    final_similar_word[m]
                                    for m in range(0, len(final_similar_word)))
                            else:
                                message += "\" OR \"".join(
                                    final_similar_word[m]
                                    for m in range(0, similar_words))

                    except Exception as e:
                        print("Exception: " + str(e))

                message += "\")"

                if counter < len(topic.argsort()[:-number_words - 1:-1]):
                    message += " AND "
                else:
                    message += ""

            message += ")"

            if topic_index < number_topics - 1:
                message += " OR "
            else:
                message += ""

        message += ")"

        if pub_year_one != 0:
            message += " AND PUBYEAR < "
            message += str(pub_year_one)

        if pub_year_two != 0:
            message += " AND PUBYEAR > "
            message += str(pub_year_two)

        return message
Example #19
def lancasterstemming(words):
    lancaster = LancasterStemmer()
    print("Lancaster Stemmer")
    for word in words:
        print(word, "--->", lancaster.stem(word))
Example #20
def generatekeywords(filepath2):
    # Taking in the file content...........
    f=open(filepath2,"r")
    file_content=f.read()
    f.close()
    # print(file_content)
    # Taking in the file content...........



    # Collecting word list........
    word_list=nltk.word_tokenize(file_content)
    # Collecting word list........


    #POS tagging ...............
    tagged_word_list=nltk.pos_tag(word_list)
    #POS tagging ...............




    # Removing stop words..........
    stop_words=set(stopwords.words("english"))
    filtered_word_list=[]
    for i in tagged_word_list:
        if i[0] not in stop_words:
            filtered_word_list.append(i)
    # Removing stop words..........



    #stemming word........
    stemmed_word_list=[]
    for i in filtered_word_list:
        if(i[0][len(i[0])-2:]=='ly'):
            k=LancasterStemmer().stem(i[0])
            if (simobject.givesim(k,i[0])>=0.6):
                stemmed_word_list.append(i)
            else :
                stemmed_word_list.append(i)
        elif (i[0][len(i[0])-1]!='e'):
            k=PorterStemmer().stem(i[0])
            if (simobject.givesim(k,i[0])>=0.6):
                stemmed_word_list.append(i)
            else :
                stemmed_word_list.append(i)
        else:
            stemmed_word_list.append(i)
    #stemming word........



    #Lemmatising word list..............
    lemmatizer=WordNetLemmatizer()
    lemmatized_word_list=[]
    for i in stemmed_word_list:
        k=lemmatizer.lemmatize(i[0])
        if (simobject.givesim(k,i[0])>=0.6):
            lemmatized_word_list.append(i)
        else :
            lemmatized_word_list.append(i)
        
    #Lemmatising word list.............


    # Finally keeping only essential POS.............
    final_processed_word_list=[]
    for i in tagged_word_list:
        if(i[1]=='CD' or i[1]=='FW' or i[1]=='NN' or i[1]=='NNS' or i[1]=='NNP' or i[1]=='NNPS' or i[1]=='JJ'):

            final_processed_word_list.append(i[0])
    # Finally keeping only essential POS.............
    final_processed_word_list=list(OrderedDict.fromkeys(final_processed_word_list))


    return (final_processed_word_list)
Example #21
words = word_tokenize(txt)  # separate words
print(*words)

filtered_words = []
stopwords_in_lang = set(
    stopwords.words("english")
)  # set of words in "english" language which are not required for analysis like "and, is"
for i in words:
    if i not in stopwords_in_lang:
        filtered_words.append(i)
#print(stopwords_in_lang)                             # print stopwords in "english"
print(filtered_words)  # words without stopwords
"""SOURCE CODE for PorterStemmer @ https://tartarus.org/martin/PorterStemmer/python.txt"""

ps = PorterStemmer()  # porterstemmer algo for stemming
ls = LancasterStemmer()  # lancasterstemmer algo for stemming
ss = SnowballStemmer("english")  # snowballstemmer algo for stemming
pslist = []
lslist = []
sslist = []
for w in words:
    pslist.append(ps.stem(w))  # apply algo on each word
    lslist.append(ls.stem(w))
    sslist.append(ss.stem(w))
print(*pslist)
print(*lslist)
print(*sslist)
"""POS tag list:

CC	coordinating conjunction
CD	cardinal digit
Example #22
def get_data(dataset_type):
    if dataset_type == "Text Analysis":
        df = pd.read_csv("../data/interviews/interviews_en.csv")
        df_class = pd.read_csv('../data/target.csv', index_col=0)
        df['Class'] = df_class['Class']
        # split X and y into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(df.text, df.Class, random_state=42)
        # st.write("X_train.shape : ", X_train.shape)
        # st.write("X_test.shape : ", X_test.shape)
        vect = CountVectorizer(min_df =1, stop_words='english' ,token_pattern=r'\b[a-zA-Z]+\b')
        X_train_dtm = vect.fit_transform(X_train)
        X_test_dtm = vect.transform(X_test)
        X_interview = [interview]
        X_interview_dtm = vect.transform(X_interview)
        #renaming
        X_train, X_test, X_interview = X_train_dtm, X_test_dtm, X_interview_dtm

    elif dataset_type == "Magical Text Analysis":
        df = pd.read_csv("../data/interviews/interviews_en.csv")
        #stem
        df = df[['label','text']]
        df.head(1)

        df_class = pd.read_csv('../data/target.csv', index_col=0)
        df['Class'] = df_class['Class']
        df.loc[50,:]= df.loc[49,:]
        df.loc[50,'text'] = interview

        import re, string, unicodedata
        import nltk
        # import contractions
        # import inflect
        # from bs4 import BeautifulSoup
        from nltk import word_tokenize, sent_tokenize
        from nltk.corpus import stopwords
        from nltk.stem import LancasterStemmer, WordNetLemmatizer
        from stop_words import get_stop_words
        from string import ascii_letters, digits, whitespace

        import glob
        import errno

        def tokenize(text):
            words = nltk.word_tokenize(text)
            return words
        def is_ascii(word):
            for c in word:
                if c in ascii_letters:
                    return True
            return False 
        def to_lowercase(words):
            """Convert all characters to lowercase from list of tokenized words"""
            new_words = []
            for word in words:
                new_word = word.lower()
                new_words.append(new_word)
            return new_words   
        def remove_punctuation(words):
            """Remove punctuation from list of tokenized words"""
            new_words = []
            for word in words:
                new_word = re.sub(r'[^\w\s]', '', word)
                if new_word != '':
                    new_words.append(new_word)
            return new_words

        def replace_numbers(words):
            """Replace all interger occurrences in list of tokenized words with textual representation"""
            p = inflect.engine()
            new_words = []
            for word in words:
                if word.isdigit():
                    new_word = 'число_' + str(word)
                    new_words.append(new_word)
                else:
                    new_words.append(word)
            return new_words        

        def remove_numbers(words):
            """Remove all interger occurrences in list of tokenized words"""
            new_words = []
            for word in words:
                if not word.isdigit():
                    new_words.append(word)
            return new_words 
        
        def remove_stopwords(words):
            """Remove stop words from list of tokenized words"""
            new_words = []
            for word in words:
                if word not in get_stop_words('bg'):
                    new_words.append(word)
            return new_words   
        def remove_empty_words(words):
            new_words = []
            for word in words:
                if word.strip():
                    new_words.append(word)
            return new_words  

        def print_words(df):
            for i, words in enumerate(df['words'], 1):
                print('Interview ' + str(i))
                print(words)     
        df['words'] = [tokenize(text) for text in df['text']]
        #print_words(df)

        df['words'] = [to_lowercase(words) for words in df['words']]
        # print_words(df)

        df['words'] = [remove_punctuation(words) for words in df['words']]
        # print_words(df)

        df['words'] = [remove_numbers(words) for words in df['words']]
        # print_words(df)

        df['words'] = [remove_stopwords(words) for words in df['words']]
        # print_words(df)

        df['words'] = [remove_empty_words(words) for words in df['words']]

        from nltk.stem import PorterStemmer
        from nltk.stem import LancasterStemmer
        from nltk.tokenize import word_tokenize 
           
        ps = PorterStemmer()
        ls = LancasterStemmer()
        # print_words(df)

        df_stem = pd.DataFrame()
        df_stem['words_stem_1'] = df.words.apply(lambda x : [ps.stem(word) for word in x])
        df_stem['words_stem_2'] = df.words.apply(lambda x : [ls.stem(word) for word in x])

        from gensim.corpora import Dictionary
        from gensim.models import NormModel
        from gensim.models import TfidfModel
        def tf_idf(df, attr):
            documents = df[attr]
            dictionary = Dictionary(documents)
            n_items = len(dictionary)
            #docbow converts to bag of words
            corpus = [dictionary.doc2bow(text) for text in documents]
            #then we apply tfidf 
            tfidf = TfidfModel(corpus) #fit tfidf on this corpus
            corpus_tfidf = tfidf[corpus] #transform the corpus
            
            #then make a dataframe out of it
            ds = []
            for doc in corpus_tfidf:
                d = [0] * n_items
                for index, value in doc :
                    d[index]  = value
                ds.append(d)
            df_tfidf = pd.DataFrame(ds)
            return df_tfidf   

        #we apply the tfidf on each stemmer
        df_tfidf_1 = tf_idf(df_stem, 'words_stem_1')
        df_tfidf_2 = tf_idf(df_stem, 'words_stem_2')      

        def get_headers(df, attr):
            documents = df[attr]
            dictionary = Dictionary(documents)
            return list(dictionary.values())


        df_tfidf_headers_1 = get_headers(df_stem, 'words_stem_1')
        df_tfidf_headers_2 = get_headers(df_stem, 'words_stem_2')

        df_tfidf_1.columns = df_tfidf_headers_1
        df_tfidf_2.columns = df_tfidf_headers_2


        df =df_tfidf_1.copy()

        # df_class = pd.read_csv('../data/target.csv', index_col=0)
        # df['Class'] = df_class['Class']
        # df.loc[50,:]= df.loc[49,:]
        # df.loc[50,'text'] = interview

        top_1_features = ['young','ye','won', 'whatev', 'us', 'unit', 'two', 'twice', 'tough', 'strong', 'stand', 'speed', 'sofia', 'shape', 'second', 'same', 'respect', 'real', 'qualiti', 'put', 'pulev', 'prove', 'pressur', 'press', 'partner', 'over', 'out', 'otherwis', 'or', 'noth', 'need', 'motiv', 'mani', 'lot', 'look', 'kubrat', 'knock', 'keep', 'judg', 'hit', 'here', 'healthi', 'game', 'friend', 'fan', 'fact', 'everyth', 'due', 'drive', 'cours', 'coach', 'clay', 'children', 'chain', 'bulgarian', 'break', 'both', 'big', 'between', 'almost', 'achiev']
        #code blah
        df_top_1 = df.loc[:, top_1_features].copy()

        print(type(df_top_1.loc[50,:]))
        X_interview = df_top_1.loc[50,:].to_frame().T

        print("X_interview shape",X_interview.shape)


        df = df_top_1.loc[:49,:].copy()
        print("df shape",df.shape)
        df_class = pd.read_csv('../data/target.csv', index_col=0)
        # df['Class'] = df_class['Class']


        # split X and y into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(df, df_class, random_state=42)
        print("X_train_shape",X_train.shape)
        # st.write("X_train.shape : ", X_train.shape)
        # st.write("X_test.shape : ", X_test.shape)
        # vect = CountVectorizer(min_df =1, stop_words='english' ,token_pattern=r'\b[a-zA-Z]+\b')
        # X_train_dtm = vect.fit_transform(X_train)
        # X_test_dtm = vect.transform(X_test)
        # X_interview = [interview]
        # X_interview_dtm = vect.transform(X_interview)
        # #renaming
        # X_train, X_test, X_interview = X_train_dtm,X_test_dtm,X_interview_dtm

        # if dataset_type == "Text Analysis":
        #         df = pd.read_csv("../data/interviews/interviews_en.csv")
        #         df_class = pd.read_csv('../data/target.csv', index_col=0)
        #         df['Class'] = df_class['Class']
        #         # split X and y into training and testing sets
        #         X_train, X_test, y_train, y_test = train_test_split(df.text, df.Class, random_state=42)
        #         # st.write("X_train.shape : ", X_train.shape)
        #         # st.write("X_test.shape : ", X_test.shape)
        #         vect = CountVectorizer(min_df =1, stop_words='english' ,token_pattern=r'\b[a-zA-Z]+\b')
        #         X_train_dtm = vect.fit_transform(X_train)
        #         X_test_dtm = vect.transform(X_test)
        #         X_interview = [interview]
        #         X_interview_dtm = vect.transform(X_interview)
        #         #renaming
        #         X_train, X_test, X_interview = X_train_dtm, X_test_dtm, X_interview_dtm
        
    elif dataset_type == "Numerical Analysis":
        df = pd.read_csv("../data/Features_Dataset.csv")
        df_transformed, df_template = numerical_transformation(df)
        # X_interview = pd.read_csv(filename_numerical_predict)
        X_interview = pd.read_csv("../data/numerical_input_template.csv")
        X_interview_transformed = numerical_transformation(X_interview)
        X_interview_in_template = df_template.append(X_interview_transformed)
        X_interview = X_interview_in_template.fillna(0).drop('Class', axis=1)
        
        # split X and y into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(df_transformed.drop('Class', axis=1),
                                                            df_transformed.Class, random_state=42)

    else:

        df = pd.read_csv("../data/interviews/interviews_en.csv")
        df_class = pd.read_csv('../data/target.csv', index_col=0)
        df['Class'] = df_class['Class']
        # split X and y into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(df.text, df.Class, random_state=42)
        # st.write("X_train.shape : ", X_train.shape)
        # st.write("X_test.shape : ", X_test.shape)
        vect = CountVectorizer(min_df =1, stop_words='english' ,token_pattern=r'\b[a-zA-Z]+\b')
        X_train_dtm = vect.fit_transform(X_train)
        X_test_dtm = vect.transform(X_test)
        X_interview = [interview]
        X_interview_dtm = vect.transform(X_interview)
        #renaming
        X_train, X_test, X_interview = X_train_dtm,X_test_dtm,X_interview_dtm

    return X_train, X_test, X_interview, y_train, y_test
Example #23
print(cb.lch_similarity(ib) == 2.5389738710582761)
print(cb.lch_similarity(dog) == 0.99852883011112725)

print('==============')
print('Stemming Words')
print('==============')

# 3_1
stemmerporter = PorterStemmer()
print(stemmerporter.stem('cooking') == 'cook')
print(stemmerporter.stem('cookery') == 'cookeri')
print(stemmerporter.stem('working'))
print(stemmerporter.stem('happiness'))

# 3_2
stemmerlan = LancasterStemmer()
print(stemmerlan.stem('cooking') == 'cook')
print(stemmerlan.stem('cookery') == 'cookery')
print(stemmerlan.stem('working'))
print(stemmerlan.stem('happiness'))
print(stemmerlan.stem('achievement'))

# 3_3
stemmerregexp = RegexpStemmer('ing')
print(stemmerregexp.stem('cooking') == 'cook')
print(stemmerregexp.stem('cookery') == 'cookery')
print(stemmerregexp.stem('ingleside') == 'leside')
print(stemmerregexp.stem('working'))
print(stemmerregexp.stem('happiness'))
print(stemmerregexp.stem('pairing'))
Example #24
from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer, SnowballStemmer, WordNetLemmatizer
from src.features.process_text.patterns import get_stemming_pattern
from nltk import pos_tag
from nltk.corpus import wordnet
from src.features.process_text.tokenize import is_tokenized, merge_tokens, word_tokenize

_stemming_porter = PorterStemmer().stem

_stemming_lancaster = LancasterStemmer().stem

_stemming_regex = RegexpStemmer(get_stemming_pattern()).stem

_stemming_snowball = SnowballStemmer('english').stem

_STEMMING_DICT = {
    'porter': _stemming_porter,
    'lancaster': _stemming_lancaster,
    'regex': _stemming_regex,
    'snowball': _stemming_snowball
}


def convert_word_stem(word_token_list, stemming_id='porter'):
    """Converts words to word stem"""
    stemming = _STEMMING_DICT.get(stemming_id)
    return [stemming(word_token) for word_token in word_token_list]
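
# Hypothetical usage of convert_word_stem (a sketch, not from the original module):
# >>> convert_word_stem(['cooking', 'cookery', 'books'], stemming_id='lancaster')
# returns the Lancaster-stemmed tokens, e.g. something like ['cook', 'cookery', 'book']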


# Annotate text tokens with POS tags
def pos_tag_text(text):
    def convert_tags(pos_tag):
Example #25
def stemming_words(word):
	stemmer = LancasterStemmer()
	word_stem = []
	for w in word:
		word_stem.append(stemmer.stem(w))
	return word_stem
Example #26
    :return: HTML front page
    """
    return render_template('index.html')


domain = ""
text = ""
punctuation = set(string.punctuation)
stop_words = set(stopwords.words("english"))
imp_keywords = set()
imp_words = set()
imp_words_list_database = []
imp_words_list_vocabulary = []

lem = WordNetLemmatizer()  # reduces a word to its dictionary (lemma) form
lancaster = LancasterStemmer()  # reduces a word to its stemmed (root) form


def remove_stopwords_puntuation(words):
    """
    Description : Accepts a list of words as input, removes stop words and
                  punctuation from it, and returns the filtered list.

    Input : List of words
    Output : List of words without stop words and punctuation.
    """
    tokanize_words = [
        word.lower() for word in words
        if word not in stop_words and word not in punctuation
    ]
    return tokanize_words
Example #27
# -*- coding: utf-8 -*-
from __future__ import division, print_function, unicode_literals
import re, os
import time
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import sys
reload(sys)
sys.setdefaultencoding("ISO-8859-1")

WnL = WordNetLemmatizer()  # Lemmatizer instance
LS = LancasterStemmer()  # Stemmer instance


def load_stop_words(path):
    """ return stop words in a python list """
    with open(path, 'rb') as f:
        stopwords = f.read()
    return stopwords.split('\r\n')


def load_docs_ap(path):
    """ read ap.txt file"""
    begin = False
    docs = []
    num_docs = 0
    with open(path, 'r') as f:
Example #28
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, RegexpStemmer
from nltk.stem.snowball import EnglishStemmer

stemmer = PorterStemmer()
print(stemmer.stem('cooking'))
print(stemmer.stem('cookery'))

stemmer2 = LancasterStemmer()
print(stemmer2.stem('cooking'))
print(stemmer2.stem('cookery'))

stemmer3 = SnowballStemmer('english')
print(stemmer3.stem('cooking'))
print(stemmer3.stem('cookery'))

# the Snowball 'english' stemmer (EnglishStemmer) is an improved Porter stemmer (Porter2).
stemmer_en = EnglishStemmer()
print(stemmer_en.stem('cooking'))
print(stemmer_en.stem('cookery'))

# regex
stemmer_reg = RegexpStemmer('ing')
print(stemmer_reg.stem('cooking'))
print(stemmer_reg.stem('thing'))
Example #29
@author: alberto
"""
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import LancasterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import re
import pickle
import requests
from bs4 import BeautifulSoup

stemmer = LancasterStemmer()


class Summarizer:
    
    def __init__(self, tfidf_vector, text=None, url=None, stemming=True):
        self.tfidf_vector = tfidf_vector
        self.title = None
        self.url = url
        
        if text is not None:
            self.text = text
            self.paragraphs = [line for line in self.text.split('\n') if len(line) > 10]
            
        elif url is not None:
            self.paragraphs, self.title = self.scrape_website()
Example #30
def test():
    print('Running some nltk tests...')

    text = "In Brazil they drive on the right-hand side of the road. " \
           "Brazil has a large coastline on the eastern side of South America."
    print(f'Tokenization Input: {text}')
    token = word_tokenize(text)
    print(f'Tokenization Result: {token}')
    # finding the frequency distribution of the tokens
    print(f'Top 10 Frequency Distribution: {FreqDist(token).most_common(10)}')

    print('Starting Stemming tests...')
    print('Stemmer Type: Porter =>', end='  ')
    pst = PorterStemmer()
    stm = ["waited", "waiting", "waits"]
    for word in stm:
        print(word + ":" + pst.stem(word), end='  ')
    print('...Complete!')
    print('Stemmer Type: Lancaster =>', end='  ')
    lst = LancasterStemmer()
    stm = ["giving", "given", "given", "gave"]
    for word in stm:
        print(word + ":" + lst.stem(word), end="  ")
    print('...Complete!')

    print('Starting Lemmatizer Tests...')
    lemmatizer = WordNetLemmatizer()
    print(f"rocks:{lemmatizer.lemmatize('rocks')}", end='  ')
    print(f"corpora:{lemmatizer.lemmatize('corpora')}", end='  ')
    print('...Complete!')

    print('Stop Words...')
    from nltk.corpus import stopwords
    a = set(stopwords.words('english'))
    original = "Cristiano Ronaldo was born on February 5, 1985, in Funchal, Madeira, Portugal."
    tokens = word_tokenize(original.lower())
    sw = " ".join([x for x in tokens if x not in a])
    print(f'Original Text: {original}')
    print(f'Removed Stop Words: {sw}')

    print('Part of speech tagging (POS)...')
    text = "vote to choose a particular man or a group(party) to represent them in parliament"
    # Tokenize the text
    tex = word_tokenize(text)
    for token in tex:
        print(nltk.pos_tag([token]))

    print('Named entity recognition...')
    text = "Google’s CEO Sundar Pichai introduced the new Pixel at Minnesota Roi Centre Event"
    # tokenize and POS Tagging before doing chunk
    token = word_tokenize(text)
    tags = nltk.pos_tag(token)
    chunk = ne_chunk(tags)
    print(chunk)

    print('Chunking...')
    text = "We saw the yellow dog"
    token = word_tokenize(text)
    tags = nltk.pos_tag(token)
    reg = "NP: { < DT >? < JJ > * < NN >}"
    a = nltk.RegexpParser(reg)
    result = a.parse(tags)
    print(result)