def make_grammatical_poem(verses, N=10, grammar=kjvdict):
    '''
    Make a poem N lines long that attempts to be
    grammatically correct given a list of verses
    and a "grammar" which is a frequency distribution
    of POS bigrams. The verses are assumed to be already
    tokenized.
    '''
    
    initial = random.choice(verses)
    poem = [initial]
    i = 0

    while i < N:
        
        initial_tags = nltk.pos_tag(initial)
        final_tag = initial_tags[-1][1]
        # fall back to a trivial distribution that always yields 'NN' if the tag is unseen
        next_tag = weighted_choice(grammar.get(final_tag, {'NN': 1}))
        random.shuffle(verses)
        
        for j in range(len(verses)):
            next_verse = verses[j]
            if nltk.pos_tag(next_verse)[0][1] == next_tag:
                poem.append(next_verse)
                initial = next_verse
                i += 1
                break
        else:
            # no verse starts with the desired tag; stop instead of looping forever
            print("Could not find matching lines to finish poem.")
            break
        
    return poem
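
The snippet above relies on a weighted_choice helper and a kjvdict POS-bigram distribution that are defined elsewhere in that project. A minimal sketch of such a helper, assuming the grammar maps each tag to a dict of next-tag counts, could look like this:

import random

def weighted_choice(freqdist):
    # freqdist is assumed to map candidate POS tags to counts
    if not freqdist:
        return 'NN'
    total = sum(freqdist.values())
    r = random.uniform(0, total)
    upto = 0.0
    for tag, count in freqdist.items():
        upto += count
        if upto >= r:
            return tag
    return tag  # numerical edge case: return the last tag seen
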
Example #2
def colocation(windowSize, pos, context, dictionary):
    if windowSize <= 0:
        return dictionary
    half = windowSize // 2
    # words preceding the target position
    forward = context[:pos]
    f = forward[-half:] if half else []
    # words following the target position
    backward = context[pos + 1:]
    b = backward[:half]
    for i, item in enumerate(f):
        dictionary["pre" + str(len(f) - i) + "-word"] = item
        dictionary["pre" + str(len(f) - i) + "-pos"] = nltk.pos_tag(nltk.word_tokenize(item))[0][1]
    for i, item in enumerate(b):
        dictionary["fol" + str(i + 1) + "-word"] = item
        dictionary["fol" + str(i + 1) + "-pos"] = nltk.pos_tag(nltk.word_tokenize(item))[0][1]
    return dictionary
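
For reference, a hypothetical call showing the window features the function builds (the exact POS tags depend on the tagger):

context = ['the', 'old', 'man', 'sat', 'on', 'the', 'bench']
features = colocation(4, 3, context, {})
# roughly: {'pre2-word': 'old', 'pre2-pos': 'JJ', 'pre1-word': 'man', 'pre1-pos': 'NN',
#           'fol1-word': 'on', 'fol1-pos': 'IN', 'fol2-word': 'the', 'fol2-pos': 'DT'}
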
Example #3
    def update(self, other):
        """Adds counts for elements in other"""
        if isinstance(other, self.__class__):
            self.n_sents += other.n_sents
            for x, n in other.items():
                self[x] += n
        else:
            for sent in other:
                self.n_sents += 1

                # import pdb;pdb.set_trace()
                if self.poscache is not None:
                    if sent in self.poscache:
                        tags = self.poscache[sent]
                    else:
                        self.poscache[sent] = tags = nltk.pos_tag(
                            nltk.word_tokenize(sent))
                else:
                    tags = nltk.pos_tag(nltk.word_tokenize(sent))

                for x in tags:
                    tok, tag = x
                    self[tag] += 1

            if self.normalize:
                for x, n in self.items():
                    self[x] /= float(self.n_sents)
Example #4
def create_synonyms(orig_word):
    '''
    function for creating synonyms for a given word
    '''
    try:
        headers = {
            "X-Mashape-Key": "aIder4iWr4msh5Scn073WRoddmAEp1qA0I3jsnSR8lfJwtyzpg",
            "Accept": "application/json"}

        response = requests.get("https://wordsapiv1.p.mashape.com/words/{}/synonyms".format(orig_word), headers=headers)
        if response.status_code == 200:
            json = response.json()
            synonyms = json['synonyms']
            # synonyms = nltk.word_tokenize(synonyms)
            synonyms = nltk.pos_tag(synonyms)
            word = nltk.word_tokenize(orig_word)
            word = nltk.pos_tag(word)[0]
            print(synonyms)
            good_syns = []
            for syn in synonyms:
                print(word[1], syn[1])
                if word[1] == syn[1]:
                    print('*')
                    good_syns.append(syn[0])
            word, _ = Word.objects.get_or_create(word=orig_word)
            for syn in good_syns[:2]:
                try:
                    new_word = Word.objects.create(word=syn.lower(), is_synonym=True)
                except Exception:
                    new_word = Word.objects.get(word=syn.lower())
                syn = Synonym.objects.create(word=new_word)
                syn.synonym_to.add(word)
            return good_syns
    except Exception as e:
        print(e)
Example #5
def writeOut(lsummary_out, allwordsphrases=[],  outputpath='.', gridset=''):    
 
    # Write data out for the last folder (gridset) encountered - MUST BE A BETTER WAY THAN THIS?
    uWordsPhrases = uniqueSet(allwordsphrases)              # Set of unique words.
    uwords =[]
    uphrases = []
    words = []
    phrases =[]
    wordtypes = []
    total_wordsphrases = total_uwordsphrases = total_words = total_phrases = 0

    ldata_out = UnicodeWriter(open(outputpath + '/'+ gridset +'/language-data.csv', 'wb'), delimiter=',', quotechar='"')
    ldata_out.writerow(["WORD", "NUMBER OF WORDS", "COUNT", "TYPE"])
    
    # Output metrics to file.
    for item in uWordsPhrases:
        num_words = len(item.split())
        item_count = allwordsphrases.count(item)
        if num_words == 1:                          # Single word
            # pos_tag expects a list of tokens, not a bare string
            word_type = nltk.pos_tag([item])[0][1]
            #word_type_help = nltk.help.upenn_tagset(word_type)
            # MAYBE CONVERT TAGS INTO MORE USEFUL WORDS?!
            ldata_out.writerow([item, str(num_words), str(item_count), word_type])
            uwords.append(item)
            wordtypes.append(word_type)
        elif num_words > 1:                         # Phrase
            nltk_words = nltk.word_tokenize(item)
            word_pos = nltk.pos_tag(nltk_words)  # HOW TO DEAL WITH PHRASES???
            word_types = [x[1] for x in word_pos]
            ldata_out.writerow([item, str(num_words), str(item_count), ", ".join(word_types)])
            # HOW TO OUTPUT EACH POS TO A COLUMN???
            uphrases.append(item)

    for item in allwordsphrases:
        num_words = len(item.split())
        if num_words == 1:
            words.append(item)
        elif num_words > 1:
            phrases.append(item)
        
    uword_types = countDuplicatesInList(wordtypes)
    
    total_wordsphrases = len(allwordsphrases)
    total_uwordsphrases = len(uWordsPhrases)
    total_uwords = len(uwords)
    total_uphrases = len(uphrases)

    total_words = len(words)
    total_phrases = len(phrases)
    
    #["File Name", "Total Words or Phrases", "Total Unique Words or Phrases", "Total Words", "Total Phrases", "Total Unique Words", "Total Unique Phrases", "Types of Word"])
    lsummary_out.writerow([gridset, str(total_wordsphrases), str(total_uwordsphrases), str(total_words), str(total_phrases), str(total_uwords), str(total_uphrases), ', '.join(map(str, uword_types))])

    raw_words_out = open(outputpath + '/'+ gridset +'/raw-unique-words.text', 'wb')
    raw_words_out.writelines('\n'.join(uWordsPhrases).encode('utf-8'))
    raw_phrases_out = open(outputpath + '/'+ gridset +'/raw-unique-phrases.txt', 'wb')
    raw_phrases_out.writelines('\n'.join(uphrases).encode('utf-8'))
    raw_words_out = open(outputpath + '/'+ gridset +'/raw-wordsphrases.text', 'wb')
    raw_words_out.writelines('\n'.join(allwordsphrases).encode('utf-8'))
Example #6
def load_data(article_text):
	global tagged_words, tagged_sentences, people, sentences
	# we give parameter to load everything from file and to save some time :) 
	if "-f" in sys.argv:
	  # tokenize & tag all words in article
	  print "Tokenizing & tagging words..."
	  tokens = nltk.tokenize.wordpunct_tokenize(article_text)
	  tagged_words = nltk.pos_tag(tokens)
	  pickle.dump(tagged_words, file('tagged_words.pickle', 'w'))

	  # extract & tokenize each sentence separately
	  print "Tokenizing & tagging sentences..."
	  sentences = nltk.tokenize.sent_tokenize(article_text)
	  pickle.dump(sentences, file('sentences.pickle', 'w'))
	  
	  tokenized_sentences = [nltk.tokenize.wordpunct_tokenize(s) for s in sentences]
	  tagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]
	  pickle.dump(tagged_sentences, file('tagged_sentences.pickle', 'w'))
	  
	  print "Searching for people..."
	  instance  = ner.NERFinder()
	  people = instance.find(tagged_words, sentences, tagged_sentences)
	  pickle.dump(people, file('people.pickle', 'w'))
	else:
	  tagged_sentences =  pickle.load(file('tagged_sentences.pickle', 'r'))
	  tagged_words =  pickle.load(file('tagged_words.pickle', 'r'))
	  sentences =  pickle.load(file('sentences.pickle', 'r'))
	  people =  pickle.load(file('people.pickle', 'r'))
Example #7
def extract_pos_pair(event_mention_1, event_mention_2):
    trigger1=""
    extent1=""
    trigger2=""
    extent2=""
    for one_anchor in event_mention_1.findall("anchor"):
        trigger1=one_anchor[0].text
    for one_anchor in event_mention_2.findall("anchor"):
        trigger2=one_anchor[0].text
    for one_extent in event_mention_1.findall("extent"):
        extent1=one_extent[0].text
    for one_extent in event_mention_2.findall("extent"):
        extent2=one_extent[0].text
    text1 = nltk.word_tokenize(extent1)
    dict1 = nltk.pos_tag(text1)
    for one_pair in dict1:
        if one_pair[0] in trigger1 or trigger1 in one_pair[0]:
            pos1=one_pair[1]
            break
    text2 = nltk.word_tokenize(extent2)
    dict2 = nltk.pos_tag(text2)
    for one_pair in dict2:
        if one_pair[0] in trigger2 or trigger2 in one_pair[0]:
            pos2=one_pair[1]
            break
    return (pos1, pos2)
Example #8
    def _get_sentiments(self, d):

        sent_word_net = load_sent_word_net()

        poscache_filename = "poscache.json"
        try:
            poscache = json.load(open(poscache_filename, "r"))
        except IOError:
            poscache = {}
        # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
        #import pdb;pdb.set_trace()
        sent = tuple(nltk.word_tokenize(d))
        #pos_tag tags tokens with part of speech (noun, verb etc)
        if poscache is not None:
            if d in poscache:
                tagged = poscache[d]
            else:
                poscache[d] = tagged = nltk.pos_tag(sent)
        else:
            tagged = nltk.pos_tag(sent)

        pos_vals = []
        neg_vals = []

        nouns = 0.
        adjectives = 0.
        verbs = 0.
        adverbs = 0.

        for w,t in tagged:
            p, n = 0,0
            sent_pos_type = None
            if t.startswith("NN"):
                sent_pos_type = "n"
                nouns += 1
            elif t.startswith("JJ"):
                sent_pos_type = "a"
                adjectives += 1
            elif t.startswith("VB"):
                sent_pos_type = "v"
                verbs += 1
            elif t.startswith("RB"):
                sent_pos_type = "r"
                adverbs += 1

            if sent_pos_type is not None:
                sent_word = "%s/%s"%(sent_pos_type, w)

                if sent_word in sent_word_net:
                    p,n = sent_word_net[sent_word]

            pos_vals.append(p)
            neg_vals.append(n)

        l = len(sent)
        avg_pos_val = np.mean(pos_vals)
        avg_neg_val = np.mean(neg_vals)
        #import pdb;pdb.set_trace()
        return [1-avg_pos_val-avg_neg_val, avg_pos_val, avg_neg_val,
                nouns/l, adjectives/l, verbs/l, adverbs/l]
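
load_sent_word_net is defined elsewhere in that project; it apparently builds a lookup from "pos/word" keys to (positive, negative) sentiment scores. A rough sketch over a SentiWordNet 3.0 dump (file name assumed) might be:

import csv
import collections

def load_sent_word_net(path="SentiWordNet_3.0.0.txt"):
    # Assumed input: tab-separated SentiWordNet dump with columns
    # POS, ID, PosScore, NegScore, SynsetTerms, Gloss; lines starting with '#' are comments.
    raw = collections.defaultdict(list)
    with open(path) as f:
        for line in csv.reader(f, delimiter='\t', quotechar='"'):
            if not line or line[0].startswith("#") or len(line) < 6:
                continue
            pos, _, pos_score, neg_score, terms, _ = line[:6]
            for term in terms.split():
                word = term.split("#")[0]
                raw["%s/%s" % (pos, word)].append((float(pos_score), float(neg_score)))
    # average the scores over all senses of each (pos, word) pair
    return {key: (sum(p for p, _ in vals) / len(vals), sum(n for _, n in vals) / len(vals))
            for key, vals in raw.items()}
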
Example #9
def load_data(path):
    sentences_pos = []
    r1 = re.compile(r'\<([^ ]+)\>')
    r2 = re.compile(r'\$US(\d)')
    for l in open(path):
        if not l.strip():
            continue
        l = l.decode('utf-8')
        l = l.replace(u'’', "'")
        l = l.replace(u'``', '"')
        l = l.replace(u"''", '"')
        l = l.replace(u"—", '--')
        l = l.replace(u"–", '--')
        l = l.replace(u"´", "'")
        l = l.replace(u"-", " ")
        l = l.replace(u"/", " ")
        l = r1.sub(r'\1', l)
        l = r2.sub(r'$\1', l)
        s = l.strip().split('\t')
        sa, sb = tuple(nltk.word_tokenize(s)
                          for s in l.strip().split('\t') if s) # ignore double \t
        sa, sb = ([x.encode('utf-8') for x in sa],
                  [x.encode('utf-8') for x in sb])

        for s in (sa, sb):
            for i in xrange(len(s)):
                if s[i] == "n't":
                    s[i] = "not"
                elif s[i] == "'m":
                    s[i] = "am"
        sa, sb = fix_compounds(sa, sb), fix_compounds(sb, sa)
        sentences_pos.append((nltk.pos_tag(sa), nltk.pos_tag(sb)))
    return sentences_pos
Example #10
def synsym(s1,s2):
    ts0 = nltk.pos_tag(nltk.word_tokenize(s1))
    ts1 = nltk.pos_tag(nltk.word_tokenize(s2))
    # adj  
    jj0 = [x for x,y in ts0 if y=='JJ' or y=='JJR' or y=='JJS']
    jj1 = [x for x,y in ts1 if y=='JJ' or y=='JJR' or y=='JJS']
    if len(jj0) == 0 or len(jj1) ==0:
      jjps = 0
    else: 
      v1 = makeFeatureVec(jj0,model,300)
      v2 = makeFeatureVec(jj1,model,300)
      jjps = np.inner(v1,v2)/(LA.norm(v1)*LA.norm(v2))
    # noun
    jj0 = [x for x,y in ts0 if y=='NN' or y=='NNS' or y=='NNP' or y=='NNPS' or y=='DT']
    jj1 = [x for x,y in ts1 if y=='NN' or y=='NNS' or y=='NNP' or y=='NNPS' or y=='DT']
    if len(jj0) == 0 or len(jj1) ==0:
      nps = 0
    else: 
      v1 = makeFeatureVec(jj0,model,300)
      v2 = makeFeatureVec(jj1,model,300)
      nps =  np.inner(v1,v2)/(LA.norm(v1)*LA.norm(v2))
    # verb
    jj0 = [x for x,y in ts0 if y=='VB' or y=='VBD' or y=='VBG' or y=='VBN' or y=='VBP' or y=='VBZ']
    jj1 = [x for x,y in ts1 if y=='VB' or y=='VBD' or y=='VBG' or y=='VBN' or y=='VBP' or y=='VBZ']
    if len(jj0) == 0 or len(jj1) ==0:
      vps = 0
    else: 
      v1 = makeFeatureVec(jj0,model,300)
      v2 = makeFeatureVec(jj1,model,300)
      vps =  np.inner(v1,v2)/(LA.norm(v1)*LA.norm(v2))    
    return [jjps,nps,vps]
Example #11
def nltk_filter(sent):
  b1, b2 = sent.split(blockSeparator)
  b2 = b2.rstrip()

  b1            = b1.lower()
  tokens        = word_tokenize(b1)
  pos_tags      = pos_tag(tokens)
  filtered_sent = ' '
  for token in tokens:
    filtered_sent += '1'+token + ' '
  # for pos_t in pos_tags:
  #   if pos_t[1] in filterList:
  #     #filtered_sent += stemmer.stem(pos_t[0]) + ' '
  #     filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

#note: 1 concat stemmer(word) == stemmer(1 concat word)

  b2            = b2.lower()
  tokens        = word_tokenize(b2)
  pos_tags      = pos_tag(tokens)
  # filtered_sent = ' '
  # for pos_t in pos_tags:
  #   if pos_t[1] in filterList:
  #     #filtered_sent += stemmer.stem(pos_t[0]) + ' '
  #     filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

  for token in tokens:
    filtered_sent += '2' + token + ' '

  return filtered_sent
Example #12
def glv_window_overlap(t1, t2, n = 5):
    ''' Looks for an alignment within the window between sentences
        (non-overlapping within the sentence) and words
        with compatible lemmas POS.  Emits features regarding the distance between common words, and
        finds the glv vector difference between pos-tag aligned words,
        inversely weighted by sentence distance. '''
        
    ''' Looks within a window of influence around word matches for context, and compares the glove 
        vectors within the (n - 1) gram context.  Produces dim * (n - 1) dense features.'''

    features = Counter()
    v_tagged = pos_tag(leaves(t1))
    w_tagged = pos_tag(leaves(t2))

    for v in ntuples(v_tagged, n):
        for w in ntuples(w_tagged, n):
            # Find alignment
            alignments = find_exact_alignments(v, w)
            for i, j in alignments:
                ''' Featurize the word alignment in the window '''  
                features[v[i][0] + str(i - j) ] += 1
            if not alignments:
                continue
            else:
                similar_align = find_tagged_alignments(v, w, alignments)
                for i, j in similar_align:
                    word_diff = np.exp ( glvvec( v[i][0]) - glvvec( w[j][0]) ) 
                    
                    for dim in range(word_diff.shape[0]): 
                        features[ v[i][1] + ' aligned dim ' +  str(dim)] += word_diff[dim]

    return features
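
leaves, ntuples, glvvec and the alignment helpers are project-specific and not shown; ntuples presumably yields sliding windows of n consecutive (word, tag) pairs, roughly:

def ntuples(tagged, n):
    # sliding windows of n consecutive (word, tag) pairs (assumed behaviour)
    return [tagged[i:i + n] for i in range(max(len(tagged) - n + 1, 0))]
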
Example #13
def normalize_word(word, lowercase=True, lemmatize=True):
    "Normalize word by stripping plural nouns"
    global NORMWORD_CACHE
    global NORMWORD_POS
    if NORMWORD_WNL is None:
        init_normword_wnl()
    if lowercase:
        word = word.lower()
    if word in NORMWORD_CACHE:
        return NORMWORD_CACHE[word]
    if not lemmatize:
        return word
    treebank_tag = nltk.pos_tag([word])[0][1]
    newword = word
    if ( len(newword) > 4 ) and ( treebank_tag == 'NNS' ):
        #  Only lemmatize plural nouns, leave verbs alone
        wnpos = get_wordnet_pos(treebank_tag)
        if wnpos:
            newword = NORMWORD_WNL.lemmatize(newword, wnpos)
        if newword != word:
            LOGGER.debug('Changing %s to %s' % (word, newword))
        NORMWORD_POS[newword] = nltk.pos_tag([newword])[0][1]
    else:
        NORMWORD_POS[word] = treebank_tag
    NORMWORD_CACHE[word] = newword
    return newword
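
get_wordnet_pos, NORMWORD_WNL and the module-level caches are defined elsewhere in that project; the Treebank-to-WordNet tag mapping is conventionally implemented along these lines (offered as an assumption):

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # map a Penn Treebank tag prefix to the corresponding WordNet POS constant
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return None
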
Example #14
    def tokenizeme(self, LanguageSample):
        self.tokenized_text = nltk.word_tokenize(LanguageSample)
        self.unique_words = list(set(self.tokenized_text))
        self.unique_words.sort()
        self.unique_words = nltk.pos_tag(self.unique_words)  # unique_words does not remove inflectional-morpheme duplicates
        self.tagged_text = [i for i in nltk.pos_tag(self.tokenized_text) if i[1] != "."]  # pos_tag gets the part of speech; the filter removes punctuation
        self.count = len(self.tagged_text)
Example #15
    def replace_proper_nouns(self, o_sent, n_sent):
        proper_nouns = []
        p_pnouns = []

        o_tagged = pos_tag(word_tokenize(o_sent))
        n_tagged = pos_tag(word_tokenize(n_sent))
        # print("\nTransforming the output:")
        # print("Input sentence:", o_sent)
        # print("Found sentence:", n_sent)
        # print("Input sentence tagged:", o_tagged)
        # print("Found sentence tagged:", n_tagged)

        for o in o_tagged:
            if o[1] == 'NNP' and o not in proper_nouns:
                proper_nouns.append(o)

        for n in n_tagged:
            if (n[1] == 'PRP' or n[1] == 'PRP$' or n[1] == 'NNP') and n not in p_pnouns:
                p_pnouns.append(n)

        # print("")

        if (len(proper_nouns) == 1) and (len(p_pnouns) > 0):
            n_sent = sub(r"\b%s\b" %p_pnouns[0][0] , proper_nouns[0][0], n_sent, 1)
            gender = self.gp.classify(proper_nouns[0][0])
            # print(proper_nouns[0][0], "is classified as", gender)
            for pnoun in p_pnouns:
                n_pnoun = self.change_gender(pnoun[0], gender)
                n_sent = sub(r"\b%s\b" %pnoun[0] , n_pnoun, n_sent)
        elif len(proper_nouns) < 1:
            print("No proper nouns to replace")
        else:
            print("Not yet implemented, :P")

        return n_sent
Example #16
def getLemma(text, contextFlag=False):
	lemmatizer = WordNetLemmatizer()
	#'NN':wordnet.NOUN,'JJ':wordnet.ADJ,'VB':wordnet.VERB,'RB':wordnet.ADV
	wordnet_tag ={'NN':'n','JJ':'a','VB':'v','RB':'r'}
	result = None
	if len(text.split()) == 1: # one word
		tokenized = word_tokenize(text)
		tagged = pos_tag(tokenized)[0]
		lemma = ''
		try: lemma = lemmatizer.lemmatize(tagged[0],wordnet_tag[tagged[1][:2]])
		except: lemma = lemmatizer.lemmatize(tagged[0])
		result = lemma
	elif len(text.split()) > 1 and contextFlag: # multiple words, i.e. text, without considering the context
		resultList = []
		for t in text.split():
			tokenized = word_tokenize(t)
			tagged = pos_tag(tokenized)[0]
			lemma = ''
			try: lemma = lemmatizer.lemmatize(tagged[0],wordnet_tag[tagged[1][:2]])
			except: lemma = lemmatizer.lemmatize(tagged[0])
			resultList.append(lemma)
		result = ' '.join(resultList)
	else: # multiple words, i.e. text, considering the context
		resultList = []
		tokens = word_tokenize(text)
		tagged = pos_tag(tokens)
		for t in tagged:
			try: resultList.append(lemmatizer.lemmatize(t[0],wordnet_tag[t[1][:2]]))
			except: resultList.append(lemmatizer.lemmatize(t[0]))
		result = ' '.join(resultList)
	return result
Example #17
def expand_with_wordnet(query):
    """
    This function expands every contentful word in the query with its wordnet
    definition. The word itself is not removed. Stop words are removed from the
    word definition as well.
    (Contentful means that it is not a stopword or punctuation sign)

    INPUT:
        query   --  user query that is a simple string
    OUTPUT:
        expanded_query  --  user query + definitions of contentful words
    """
    stop = stopwords.words("english")
    stop += EXCLUDED
    contentful_tokens = [tok for tok in query.split() if tok not in stop]
    # take the first definition for the current word
    defs = []
    for token in contentful_tokens:
        syn1 = wn.synsets(token, pos=wn.ADJ)[:1]
        syn2 = wn.synsets(token, pos=wn.NOUN)[:1]
        # prefer the adjective sense; otherwise fall back to the noun sense
        if syn1:
            defs.append(token)
            def_tokenized = word_tokenize(syn1[0].definition())
            [defs.append(t[0]) for t in pos_tag(def_tokenized) if t[1] in ["NN", "JJ"]]
        elif syn2:
            defs.append(token)
            def_tokenized = word_tokenize(syn2[0].definition())
            [defs.append(t[0]) for t in pos_tag(def_tokenized) if t[1] in ["NN", "JJ"]]
    # expansion can add some EXCLUDED words back in the query
    defs = set(defs) - set(EXCLUDED)  # removing again
    expanded = " ".join(defs)
    return expanded
Example #18
def jaccard_similarity(statement, other_statement, threshold=0.5):
    """
    The Jaccard index is composed of a numerator and denominator.
    In the numerator, we count the number of items that are shared between the sets.
    In the denominator, we count the total number of items across both sets.
    Let's say we define sentences to be equivalent if 50% or more of their tokens are equivalent.
    Here are two sample sentences:

        The young cat is hungry.
        The cat is very hungry.

    When we parse these sentences to remove stopwords, we end up with the following two sets:

        {young, cat, hungry}
        {cat, very, hungry}

    In our example above, our intersection is {cat, hungry}, which has count of two.
    The union of the sets is {young, cat, very, hungry}, which has a count of four.
    Therefore, our Jaccard similarity index is two divided by four, or 50%.
    Given our threshold above, we would consider this to be  a match.
    """
    from nltk.corpus import wordnet
    import nltk
    import string

    a = statement.text
    b = other_statement.text

    # Get default English stopwords and extend with punctuation
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    def get_wordnet_pos(pos_tag):
        if pos_tag[1].startswith('J'):
            return (pos_tag[0], wordnet.ADJ)
        elif pos_tag[1].startswith('V'):
            return (pos_tag[0], wordnet.VERB)
        elif pos_tag[1].startswith('N'):
            return (pos_tag[0], wordnet.NOUN)
        elif pos_tag[1].startswith('R'):
            return (pos_tag[0], wordnet.ADV)
        else:
            return (pos_tag[0], wordnet.NOUN)

    ratio = 0
    pos_a = map(get_wordnet_pos, nltk.pos_tag(nltk.tokenize.word_tokenize(a)))
    pos_b = map(get_wordnet_pos, nltk.pos_tag(nltk.tokenize.word_tokenize(b)))
    lemmae_a = [lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos) for token, pos in pos_a \
                    if pos == wordnet.NOUN and token.lower().strip(string.punctuation) not in stopwords]
    lemmae_b = [lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos) for token, pos in pos_b \
                    if pos == wordnet.NOUN and token.lower().strip(string.punctuation) not in stopwords]

    # Calculate Jaccard similarity
    try:
        ratio = len(set(lemmae_a).intersection(lemmae_b)) / float(len(set(lemmae_a).union(lemmae_b)))
    except Exception as e:
        print('Error', e)
    return (ratio >= threshold)
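
The function expects statement objects exposing a .text attribute (as in ChatterBot); a quick illustrative call with stand-in objects, matching the worked example in the docstring:

from types import SimpleNamespace

a = SimpleNamespace(text="The young cat is hungry.")
b = SimpleNamespace(text="The cat is very hungry.")
print(jaccard_similarity(a, b))  # expected True: the shared noun lemmas reach the 0.5 threshold
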
Example #19
def demo():
    # split paragraph into sentences using punct
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(paragraphs)
    
    # split sentence into tokens (wrods + puncts)
    s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    WordPunctTokenizer().tokenize(s)
    #['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    PunktWordTokenizer().tokenize(s)
    #['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    PunktWordTokenizer().span_tokenize(s)
    #[(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),  (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
    
    #split the paragraph into sentence
    nltk.sent_tokenize(s)
    #split sentence into word and punct
    nltk.word_tokenize(s)
    
    # pos tagging
    nltk.pos_tag(nltk.word_tokenize(s))

Example #20
def text_to_pos_list(lst):
    dpos_list = []
    tpos_list = []
    for line in lst:
        if "IsTruthFul" in line:
            continue
        else:
            if line[0] == "0": #If deceptive:
                dpos_list.append("<r>")
                for sent in nltk.tokenize.sent_tokenize(parse_line(line)):
                    dpos_list.append("<s>")
                    text = nltk.word_tokenize(sent)
                    tagged = nltk.pos_tag(text)
                    for t in tagged:
                        dpos_list.append(t)
                    dpos_list.append("</s>")
                dpos_list.append("</r>")
            else:
                tpos_list.append("<r>")
                for sent in nltk.tokenize.sent_tokenize(parse_line(line)):
                    tpos_list.append("<s>")
                    text = nltk.word_tokenize(sent)
                    tagged = nltk.pos_tag(text)
                    for t in tagged:
                        tpos_list.append(t)
                    tpos_list.append("</s>")
                tpos_list.append("</r>")
    return (dpos_list, tpos_list)
Example #21
def redundant(text1, text2):
    tag1 = nltk.pos_tag(text1)
    tag2 = nltk.pos_tag(text2)
    
    l1=len(tag1)
    l2=len(tag2)
    i=0
    count = 0
    while i < l1 :
        j = 0
        #print(tag1[i])
        s1 = tag1[i]
        #print(s1[1])
        while j < l2 :
            s2 = tag2[j]
            if str(s1[1]) == str(s2[1]) and str(s1[0]) == str(s2[0]) :
                #print(s1[0]+s1[1])
                count = count + 1                
            j = j + 1
        i = i + 1
    match = 2*count / (l1 + l2)
    match = match * 100
    #print(str(count))        
    #if count > 1 :
    #print("match percent ",match,"% ")
        #return 1
    #else:
    if match > 70:
        return 1
    else:
        return 0
Example #22
def extract_entities2(text):
	entities = []
	
	"""t0 = nltk.DefaultTagger('NN')
	t1 = nltk.UnigramTagger(train_sents, backoff=t0)
	t2 = nltk.BigramTagger(train_sents, backoff=t1)
	t2.evaluate(test_sents)"""
	
	for sentence in sent_tokenize(text):
	    #print pos_tag(nltk.word_tokenize(sentence))
	    print sentence
	    tags=pos_tag(nltk.word_tokenize(sentence))
	    tags=tagear(tags)
	    chunks = ne_chunk(pos_tag(nltk.word_tokenize(sentence)))
	    #chunks = ne_chunk(regexp_tagger.tag((nltk.word_tokenize(text))))
	    chunks = ne_chunk(tags)
	    #chunks.draw()
	    #print chunks
	    for chunk in chunks:
	    	#print chunk
	    	#if hasattr(chunk, 'node'):
	    	#	print chunk.node
	    	if hasattr(chunk, 'node') :
	    		print chunk	
	    		entities.extend([chunk for chunk in chunks if hasattr(chunk, 'node')])
	return entities
Example #23
    def test_nltkNERParsing(self):
        testString = 'Natural Sciences and Engineering Research Council of Canada'
        unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
        getGPEs = []

        for treeBranch in chunked:
            if hasattr(treeBranch, 'label') and treeBranch.label() == 'GPE':
                getGPEs.append(str(treeBranch))

        self.assertEqual(1, len(getGPEs))

        testString = 'Milwaukee Foundation'
        unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
        # returns (S (PERSON Milwaukee/NNP) (ORGANIZATION Foundation/NNP))

        testString = 'New England Board of Higher Education'
        unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
        # returns (S (GPE New/NNP)(ORGANIZATION England/NNP Board/NNP) of/IN (PERSON Higher/NNP Education/NNP))

        testString = 'New England Board of Higher Education'
        unigrams = TokenizeOnWhitespacePunctuation(testString).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
Example #24
    def parse_stock_name(self, stockname):
        p = engine()

        instruction_set = stockname.split(',')
        word_list = instruction_set[0].split(' ')
        index = 1
        categories_ignored = ['RB', 'TO']
        tokens = word_tokenize(instruction_set[0])
        tags = pos_tag(tokens)
        i=0
        while i < len(tags):
            if tags[i][1] in categories_ignored:
                index += 1
                i+= 1
            else:
                break

        quantity = word_list[index-1]
        disallowed = ['g', 'ml', 'x', 'kg', 'cups', 'cup', 'grams', 'can', 'tbsp', 'tsp', 'tbsps', 'tsps',
                 'small', 'bunch', 'piece', 'handful', 'pack', 'chopped', 'large', 'a', 'pinch',
                 'fresh', 'dried', 'heaped', 'thick', 'slices', 'slice', 'of', 'about']
        while index < len(word_list):
            if word_list[index] not in disallowed:
                break
            else:
                index+=1
        sentence = " ".join(word_list[index:])
        tokens = word_tokenize(sentence)
        categories = pos_tag(tokens)
        words = []
        for category in categories:
            if category[1] not in ['NNS', 'VBN', 'VBG']:
                words.append(category[0])
        word = " ".join(words)
        return quantity, word, None
Example #25
def test(ws,wf,s,pf,wm,alfa2):
    f1=open('test_data.data','rb')
    f2=open('test.csv','rb')
    val_text=f1.read()
    comt=f2.read().splitlines()
    val_lines=val_text.splitlines()
    acc=0
    lc=0
    for line in val_lines:
        token = line.split(' | ')
        token[2]="<S> "+token[2]+" <E>"
        t_t =token[2].split(' %% ')
        if t_t[0]!="<S> ":
            bff = nltk.pos_tag(t_t[0].split(".")[-1].split(" "))[-1][1]
        else:
            bff="<S>"
        if t_t[2]!=" <E>":
            aff = nltk.pos_tag(t_t[2].split(".")[0].split(" "))[0][1]
        else:
            aff="<E>"
        val_label = nb(ws,wf,s,token[0],pf,aff,bff,alfa2)
        if val_label==comt[lc].split(",")[1]:
            acc+=1
        lc+=1
    print float(acc)/len(val_lines)
    f1.close()
    f2.close()
Example #26
def score_glove_pos(src, dst, numpy_arrays, labels_array, g, normalize=True):
	b1 = []
	b2 = []
	lines = 0
	with open(src) as p:
		for i, line in enumerate(p):
			s = line.split('\t')
			b1.append(s[0])
			b2.append(s[1][:-1]) #remove \n
			lines = i + 1

	b1_pos = [nltk.pos_tag(nltk.word_tokenize(re.sub(r'[^\x00-\x7F]+',' ', text))) for text in b1]
	b2_pos = [nltk.pos_tag(nltk.word_tokenize(re.sub(r'[^\x00-\x7F]+',' ', text))) for text in b2]

	res = []
	for i in range(lines):
		tags1 = [tag[0] for tag in b1_pos[i] if tag[1] in NOUN]
		tags2 = [tag[0] for tag in b2_pos[i] if tag[1] in NOUN]
		r = [1 - spatial.distance.cosine(g[tag1], g[tag2]) for tag1 in tags1 for tag2 in tags2 if tag1 in labels_array and tag2 in labels_array]
		if len(r) == 0:
			res.append(0)
		else:
			res.append(round(5*max(r), 2))

	if normalize:
		res = normarlize_score(res)
			
	with open(dst, 'w') as thefile:
		thefile.write("\n".join(str(i) for i in res))
	print src + ' finished!'
Example #27
def m_surrounding(self):
    D = {}
    sent = self.sentence["form"]
    l = len(sent)
    #print sent 
    K = self.index
    '''
    for k in range(l):
        if sent[k] == self.word:
            K = k
            break
    '''
    #print K, l
    tagp = tagn = ""
    if (K+1) < l:
        tagn = nt.word_tokenize(sent[K+1])
        tagn = nt.pos_tag(tagn)     
    if (K-1) >=0:
        tagp = nt.word_tokenize(sent[K-1])
        tagp = nt.pos_tag(tagp)        
        
    if tagp != "":
        D["ptag"] = tagp[0][1]
    else: 
        D["ptag"] = ""
    if tagn != "":    
        D["ntag"] = tagn[0][1]
    else:
        D["ntag"] = ""
        
    print D
    return D 
Example #28
def printer(sentencescorelist, sentenceList, wordscorelist, wordList):
    outFile = open('./tldr/outFile.txt', 'w')
    for s in range(0, len(sentenceList)):
        if s in sentencescorelist:
            printsentence(sentenceList[s], outFile)
    outFile.write("Topics to research: ")

    topics = []
    numtopics = 3
    poswords = nltk.pos_tag(wordList)
    poskeep = ["NN", "NNS", "NNP", "NNPS"]

    while numtopics > 0:
        temp = max(wordscorelist.iteritems(), key=operator.itemgetter(1))[0]
        templist = [temp]
        templist = nltk.pos_tag(templist)
        if templist[0][1] in poskeep:
            numtopics -= 1
            topics.append(temp)
        del wordscorelist[temp]
    for i in range(0, len(topics)):
        if i != len(topics) - 1:
            outFile.write(topics[i] + ", ")
        else:
            outFile.write(topics[i])
    outFile.close()
Example #29
def test(ws,wf,s,pf):
    f1=open('validation_data.data','rb')
    #f2=open('test_data.csv','w')
    val_text=f1.read()
    val_lines=val_text.splitlines()
    acc=0

    for line in val_lines:
        token = line.split(' | ')
        t_t =token[2].split(' %% ')
        if t_t[0]!="<S>":
            bff = nltk.pos_tag(t_t[0].split(".")[-1].split(" "))[-1][1]
        else:
            bff="<S>"
        if t_t[2]!="<\S>":
            aff = nltk.pos_tag(t_t[2].split(".")[0].split(" "))[0][1]
        else:
            aff="<\S>"
        val_label = nb(ws,wf,s,token[0],pf,aff,bff)
        #f2.write(token[0]+" | "+val_label+" | "+token[2])
    #f1.close()
    #f2.close()
    #print "Done"
    
        

        if val_label==token[1]:
            acc+=1
    print float(acc)/len(val_lines)
Example #30
        def make_pos(target_tag, edit_rev):
            tags, srcs, dsts = edit_rev

            sentence = ''

            if target_tag == del_tag:
                sentence = dsts
            elif target_tag == add_tag:
                sentence = srcs

            if target_tag in tags:
                tag_indexes = [i for i, x in enumerate(tags) if x == target_tag]
                trimed = sentence
                for tag_index in tag_indexes:
                    trimed = trimed[:tag_index] + trimed[tag_index+1:]

                posed = pos_tag(trimed)
                pos = [w[1] for w in posed]
                for tag_index in tag_indexes:
                    pos.insert(tag_index, u'')

                # debug
                none_indexes = [i for i, x in enumerate(pos) if x == u'']
                if tag_indexes != none_indexes:
                    print(tag_indexes, file=sys.stderr)
                    print(none_indexes, file=sys.stderr)
                    print(tags, file=sys.stderr)
                    print(pos, file=sys.stderr)
            else:
                posed = pos_tag(u' '.join(sentence).split())
                pos = [w[1] for w in posed]

            return pos
Example #31
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize

sentence = """I am a student of UMKC.
Studying masters in CS."""
s = word_tokenize(sentence)
print(s)

print('Stemming')
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()
example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
for w in example_words:
    print(ps.stem(w))

print('parts of speech')
pos = nltk.pos_tag(s)
print(pos)

print('lemmatization')
lemmatize = nltk.WordNetLemmatizer()
print(lemmatize.lemmatize('schooling', pos='v'))

from nltk.util import ngrams
from collections import Counter
print('trigrams')
trigrams = ngrams(s, 3)
print(Counter(trigrams))

from nltk import pos_tag, ne_chunk
import numpy
print(ne_chunk(pos_tag(wordpunct_tokenize(sentence))))
Example #32
def getPOS(words):
    # POS tag based feature set
    # get the parts of speech tags
    parts_of_speech = nltk.pos_tag(words)

    ret = {}
    verbCount = 0
    nounCount = 0
    properNounCount = 0
    adjCount = 0
    prpCount = 0
    word_list = []
    punctuation = [".", ",", "!", "?", ";", ":", "\'", "\""]

    for (word, pos) in parts_of_speech:
        if word not in punctuation:
            word_list.append(word)

            if 'NNP' in pos:
                properNounCount += 1
            elif 'PRP' in pos:
                prpCount += 1

            # TODO possibly get social relationships

            # simplify the POS tag
            tag = map_tag('en-ptb', 'universal', pos)
            # increment pos counters
            if "NOUN" in tag:
                nounCount += 1
            elif "ADJ" in tag:
                adjCount += 1
            elif "VERB" in tag:
                verbCount += 1

    wordCount = len(word_list)

    # record the percentages the pos
    np = 0
    ap = 0
    vp = 0

    if (wordCount > 0):
        np = nounCount / wordCount
        ap = adjCount / wordCount
        vp = verbCount / wordCount

    # check the documentation for binning explanation
    # bin the nouns and add them to dictionary
    ret["nouns"] = np
    ret["adjectives"] = ap
    ret["verbs"] = vp

    if np < .145:
        ret["noun_percentage"] = 0
    elif np < .255:
        ret["noun_percentage"] = 1
    else:
        ret["noun_percentage"] = 2

    # bin the adjectives and add them to dictionary
    if ap < .028:
        ret["adj_percentage"] = 0
    elif ap < .096:
        ret["adj_percentage"] = 1
    else:
        ret["adj_percentage"] = 2

    # bin the verbs and add them to dictionary
    if vp < .13:
        ret["verb_percentage"] = 0
    elif vp < .22:
        ret["verb_percentage"] = 1
    else:
        ret["verb_percentage"] = 2

    if (wordCount > 0):
        ret["Personal_Pronoun_Percentage"] = prpCount / wordCount
    else:
        ret["Personal_Pronoun_Percentage"] = 0

    if (nounCount > 0):
        ret["Proper_Noun_Percentage"] = properNounCount / nounCount
    else:
        ret["Proper_Noun_Percentage"] = 0

    ret["word_count"] = wordCount

    return (word_list, ret)
Example #33
                else:
                    reNE = reNE + "|" + listNE[NE]

            m = re.search('\((' + reNE + ')(\s)', str(namedEnt[i]))
            if m:
                typeEntity = m.group(1)
                entityList.append((entity, typeEntity))
    return [(elem[0], elem[1]) for elem in entityList]


### GET TOP N NAMED ENTITIES
all_entities = [
    get_entities(
        nltk.ne_chunk(
            nltk.pos_tag(
                nltk.word_tokenize(
                    s.encode('utf-8').decode('unicode_escape')))))
    for s in sents
]
token_left = [entity[0] for entities in all_entities for entity in entities]
print(token_left[0:100])
fdist = nltk.FreqDist(token_left)
for k in (sorted(fdist, key=fdist.__getitem__, reverse=True)[0:30]):
    if (has_wikipedia_page(k)):
        print(k, fdist[k])

#===============================================================================
# IN = re.compile(r'.*\bin\b(?!\b.+ing)')
# for doc in [nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(s.encode('utf-8').decode('unicode_escape')))) for s in sents]:
#     for rel in nltk.sem.extract_rels('PERSON', 'LOC', doc, pattern = IN):
#         print(nltk.sem.rtuple(rel))
Example #34
def tags_for_sent(sentence):
    return map(change_to_wordnet_tag,
               nltk.pos_tag(nltk.word_tokenize(sentence)))
Example #35
import nltk, random
from nltk.corpus import nps_chat
from nltk.corpus import brown
from nltk import word_tokenize

posts = nltk.corpus.nps_chat.xml_posts()
featuresets = [nltk.pos_tag(word_tokenize(post.text)) for post in posts]
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(featuresets, backoff=t0)
t2 = nltk.BigramTagger(featuresets, backoff=t1)

##text = word_tokenize("I am good");
##print(t2.tag(text));
##print(text);

from nltk.corpus import movie_reviews as movies
pos_docs = movies.fileids('pos')
neg_docs = movies.fileids('neg')
classifier_training = []

for doc in pos_docs:
    sents = movies.sents(doc)
    for sent in sents:
        tagged = t2.tag(sent)
        words = [w for w, k in tagged]
        tags = [k for w, k in tagged]
        feature = {}
        for i in range(len(words) - 1):
            feature[words[i] + ' ' +
                    words[i + 1]] = tags[i] + ' ' + tags[i + 1]
Example #36
text = "I'm booking hotel"

# 1 tokenization
tokens = nltk.word_tokenize(text)

#print(nltk.pos_tag(tokens))

for i in tokens:
    pass
    #print(nltk.pos_tag([i]))

# 2 noun selection (tagging), stopword removal and lowercasing.
#   POS tagging works well if you pass it the whole list of tokens rather than a single word
#   chapter 5, part 1
candidates_tokens = [
    token[0].lower() for token in nltk.pos_tag(tokens) if token[1][0:2] == 'NN'
    and token[0] not in nltk.corpus.stopwords.words('english')
]
#print(candidates_tokens)

# 3.1 Porter stemming     chapter 3, part 3.6   I ONLY USE THIS ONE
#     Use Porter when you want to index text and search using alternative word forms.
porter = nltk.PorterStemmer()
tokens_stemmed1 = [porter.stem(token) for token in candidates_tokens]
#print(tokens_stemmed1)

# 3.2 Lancaster stemming
porter = nltk.LancasterStemmer()
tokens_stemmed2 = [porter.stem(token) for token in candidates_tokens]
#print(tokens_stemmed2)
Example #37
from nltk import pos_tag
from nltk.stem import PorterStemmer

from word_setting import *

print(pos_tag(tool_keywords1))
Example #38
                             "k10", "k11", "k12", "k13", "k14", "k15"
                         ],
                         header=0,
                         sep=',',
                         error_bad_lines=False,
                         encoding='utf-8')

#TODO later: Include emoji about weather in tokenizer ()
tokenized_text = []
#Include % and minus for temperature forecast figures
tokenizer = RegexpTokenizer("\w+|%|-")

#Start pre-processing
for tweet in train_data.tweets:
    #Tokenize
    tokens = tokenizer.tokenize(tweet)

    #Pos tagging
    append_pos = []
    tagged_tokens = nltk.pos_tag(tokens)
    for posTag in tagged_tokens:
        # Tagging is case sensitive, so lower needs to be after
        lower_word = posTag[0].lower()

        #Keep all verbs, adj, noun, adv
        if (posTag[1].startswith("V") or posTag[1].startswith("J")
                or posTag[1].startswith("N") or posTag[1].startswith("R")):
            append_pos.append(lower_word)

    #Append each tokenized tweet in the list
    tokenized_text.append(append_pos)
Example #39
# Opening input file in read mode
inputF = open(inputFilePath, "r")
# Opening the intermediate and the final file in write mode
outputF = open(intermFilePath, "w")
outputF1 = open(outputFilePath, "w")

# Holds all tag patterns
taglist = []

# Reading through the vocab file
# Tagging each line
# Extract just the tag from the tagged line and insert it into the list
for line in inputF:
    tokens = nltk.word_tokenize(line)
    tagggedT = nltk.pos_tag(tokens)
    temp = " "
    temp2 = " "
    for eachToken in tagggedT:
        temp2 = temp2 + eachToken[0] + "/" + eachToken[1] + " "
        temp = temp + eachToken[1] + " "
    taglist.append(temp)
    temp2 = temp2 + "\n"
    outputF.write(temp2)
tagset = set(taglist)
# print tagset

count = 0
# Getting the tag patterns which appear more than 100 times
for val in tagset:
    mt = re.match(r'^(( [A-Z]+)? ([A-Z]+ )?NN[A-Z]* )$', val)
Example #40
	def posTagging(self, corpus):
		tags = []
		for sentence in corpus:
			tag = nltk.pos_tag(sentence)
			tags.append(tag)
		return tags
Example #41
def ie_preprocess(document):
   sentences = nltk.sent_tokenize(document) 
   sentences = [nltk.word_tokenize(sent) for sent in sentences] 
   sentences = [nltk.pos_tag(sent) for sent in sentences] 
   return(sentences)
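
A quick illustrative run of this classic preprocessing pipeline (the exact tags depend on the tagger and model version):

document = "NLTK is a leading platform. It works with human language data."
for tagged_sentence in ie_preprocess(document):
    print(tagged_sentence)
# e.g. [('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('leading', 'VBG'), ('platform', 'NN'), ('.', '.')]
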
Example #42
def tagList(jsonList: list, whWord: str, collocate: str, context: str):
    for obj in jsonList:
        if not obj["sentence"] or obj["sentence"] == "":  # Skip conditions
            continue

        sent = obj["sentence"]
        #iterate through list reversed, find the verb first and then the wh

        obj["clean_sentence"] = None

        tok_sents = sent_tokenize(sent)
        for s in tok_sents:
            if whWord.lower() in s.lower() and context.lower() in s.lower():
                sent = s
                obj["clean_sentence"] = s
                break

        tagged = pos_tag(word_tokenize(sent))

        clauseType = None
        modal = None
        verb = None

        modals = [
            'am', 'is', 'are', 'was', 'were', 'being', 'been', 'be', 'have',
            'had', 'has', 'do', 'does', 'did', 'can', 'could', 'may', 'might',
            'shall', 'should', 'will', 'would', 'must'
        ]

        context_wh, wh_collocate = f.get_sets_backwards(
            tagged, context, whWord, collocate)

        obj['context_wh'] = str(context_wh)
        obj['wh_collocate'] = str(wh_collocate)

        obj['wh'] = whWord
        obj['phrase'] = context

        try:
            #tag relative clauses
            if f.x_in_set("N", context_wh, is_pos=True) or f.x_in_set(
                    "DT", context_wh, is_pos=True) or f.x_in_set(
                        "JJ", context_wh, is_pos=True):
                clauseType = "Relative Clause"
            #tag infinitive clauses
            elif f.x_in_set("to", wh_collocate, is_pos=False):
                clauseType = "Non-Finite"
                verb = f.get_pos_word_in_set(wh_collocate, 'V')
            #wh__modal__NNP__VB
            elif f.x_in_set(modals, wh_collocate, is_pos=False):
                clauseType = "Modal"
                modal = f.get_pos_word_in_set(wh_collocate, 'M')
                verb = f.get_pos_word_in_set(wh_collocate, 'V')
            else:
                clauseType = "Finite"
                verb = f.get_pos_word_in_set(wh_collocate, 'V')

        except:
            print("BROKE HERE: ")
            print(obj["resNumber"])
            print(sent)
            print(tagged)
            print(context_wh)
            break

        obj['clauseType'] = clauseType
        obj['modal'] = modal
        obj['verb'] = verb

    return jsonList
Example #43
short_pos = open("short_reviews/positive.txt", "r").read()
short_neg = open("short_reviews/negative.txt", "r").read()

# move this up here
all_words = []
documents = []

#  J is adjective, R is adverb, and V is verb
#allowed_word_types = ["J","R","V"]
allowed_word_types = ["J"]

for p in short_pos.split('\n'):
    documents.append((p, "pos"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

for p in short_neg.split('\n'):
    documents.append((p, "neg"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

save_documents = open("pickled_algos/documents.pickle", "wb")
pickle.dump(documents, save_documents)
save_documents.close()
Example #44
            x = i.split('.')[1]
            y = i.split('.')[0]
            word_tokens[num][n] = y
            word_tokens[num].append(x)
        elif (len(i) == 1):
            word_tokens[num][n] = ''

##stripping leading and trailing spaces from words and appending them to a new list
word_tokens_ = []
for i in word_tokens:
    temp = [j.strip(' ') for j in i]
    temp_1 = [i for i in temp if len(i) > 2]
    word_tokens_.append(temp_1)

##creating pos_tags for the corpus
pos_tags = [nltk.pos_tag(i) for i in word_tokens_]

##removing NNPs, prepositions, modal verbs or fillers, foreign words
NNP = []
CD = []
IN = []
MD = []

for i in pos_tags:
    for k, v in i:
        if v == 'NNP':
            NNP.append(k)
        elif v == 'CD':
            CD.append(k)

        if v == 'MD':
Example #45
def answer_eight():
    parts_of_speech = nltk.pos_tag(text1)
    count = nltk.FreqDist(tag for (word, tag) in parts_of_speech)
    answer = count.most_common()[:6]
    output = [i for i in answer if i[0] != ',']
    return output
Example #46
def fetch_words_from_news(url):
    # return [[],[]] 0: en, 1:tr
    article = Article(url)
    article.download()
    article.parse()

    for_nltk = []
    news_text = article.text
    for_nltk.append(article.text)
    news_text = news_text.upper()
    news_text_wo_rn = news_text.replace('\n', ' ')
    news_text_wo_rn = news_text_wo_rn.replace('\r', ' ')
    news_text_list = news_text_wo_rn.split(' ')
    news_text_list = set(news_text_list)
    tokenized_sents = [word_tokenize(i) for i in for_nltk]

    # remove punctuations from list

    res = []
    new_res = []

    #s.translate(None, string.punctuation)

    #res = [s.translate(str.maketrans('', '', string.punctuation)) for s in tokenized_sents[0]

    for tixt in tokenized_sents[0]:
        new_tixt = ''.join(
            c.translate(str.maketrans('', '', string.punctuation + '“”'))
            for c in tixt if c not in string.punctuation + '“”')
        res.append(new_tixt)

    for d in res:
        if not d == '':
            new_res.append(d)

    capitalized_new_res = [KAP.upper() for KAP in new_res]

    capitalized_setted_new_res = set(capitalized_new_res)

    # delete one len item

    more_than_one_len_CSNR = []

    for e in capitalized_setted_new_res:
        if not len(e) < 2:
            more_than_one_len_CSNR.append(e)

    # delete numbers

    digitless_more_than_OLC = []

    for g in more_than_one_len_CSNR:
        if g.isalpha():
            digitless_more_than_OLC.append(g)

    tags_of_diggless = [nltk.pos_tag(f) for f in digitless_more_than_OLC]
    tags_of_diggless_2 = nltk.pos_tag(digitless_more_than_OLC)

    prepless_digitless_MTO = []

    for h in digitless_more_than_OLC:
        if not h.lower() in stop_words:
            prepless_digitless_MTO.append(h)

    if_word_in_cor_PDMTO = []
    TR_if_word_in_cor_PDMTO = []

    for g in prepless_digitless_MTO:
        if g.lower() in words.words():
            if_word_in_cor_PDMTO.append(g)
            tr.set_text(g)
            TR_if_word_in_cor_PDMTO.append(tr.translate())

    return [if_word_in_cor_PDMTO,
            TR_if_word_in_cor_PDMTO]  # return [[],[]] 0: en, 1:tr
Example #47
    no_punct = [token for token in sent_t if token not in punctuation]
    temp.append([token for token in no_punct if token not in stops])

sent_tokens = temp
word_tokens = [token for token in word_tokens if token not in punctuation]
word_tokens = [token for token in word_tokens if token not in stops]

# Stems are basic versions of words
stemmer = PorterStemmer()
stems = {token: stemmer.stem(token) for token in word_tokens}

# Lemmas look at the meaning of the word
lemmatizer = WordNetLemmatizer()
lemmas = {token: lemmatizer.lemmatize(token) for token in word_tokens}

tagged_sent = [nltk.pos_tag(sent) for sent in sent_tokens]

tagged_words = nltk.pos_tag(word_tokens)
ne_chunked = nltk.ne_chunk(tagged_words, binary=True)

vader_analyzer = SentimentIntensityAnalyzer()
polarity_scores = [vader_analyzer.polarity_scores(sent) for sent in sentences]

top_tokens(word_tokens)
top_stems(stems)
top_lemmas(lemmas)
top_nouns_verbs(tagged_sent)
top_entities(ne_chunked)
top_sentiment_sentence(polarity_scores)
identify_weird_words(word_tokens)
disp_text(text)
Example #48
        return conf


# # training-data
short_pos = open("short_reviews\positive.txt", "r").read()
short_neg = open("short_reviews\\negative.txt", "r").read()
documents, all_words = [], []

# J is adjective, R is adverb, and V is verb
# allowed_word_types = ["J","R","V"]
allowed_word_types = ["J"]

for p in short_pos.split("\n"):
    documents.append((p, "pos"))
    words = word_tokenize(p)
    short_pos_words = nltk.pos_tag(words)
    for w in short_pos_words:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

for n in short_neg.split("\n"):
    documents.append((n, "neg"))
    words = word_tokenize(n)
    short_neg_words = nltk.pos_tag(words)
    for w in short_neg_words:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

documents_temp = open("pickled_algorithms/documents.pickle", "rb")
documents = pickle.load(documents_temp)
documents_temp.close()
Example #49
parser = BeautifulSoup(html, 'html.parser')

texts = parser.findAll('p', attrs={'class': 'body-text'})

# remove empty tags
texts = [text for text in texts if len(text) > 0]

# remove HTML tags
texts = [re.sub(r"<.*?>", "", str(text)) for text in texts]

texts = texts[0]

#for text in texts:
tokens = nltk.word_tokenize(texts)

tokens_pos = nltk.pos_tag(tokens)

chunks = nltk.ne_chunk(tokens_pos)
tagged_chunks = [chunk for chunk in chunks if type(chunk) != tuple]
#print(tagged_chunks)
''' Annotation guidelines:
For annotation we looped through all tokens in the text and annotated each token
if it is either (part of) a person (PERSON), location (GPE) or organization (ORGANIZATION)

 '''
''' CoNLL2003 format:
Token	POS-tag		Gold standard NER tag	Actual Tag
Poep	N			O						Person
 '''

#Manually annotate and store (token_index, tag)
Example #50
def extract_data(status):

    url = "traffic_data.txt"
    names = ['message', 'outcome']
    dataset = pandas.read_csv(url, names=names)

    dataset_x = dataset["message"]
    dataset_y = dataset["outcome"]

    cv = TfidfVectorizer(min_df=1, stop_words='english')

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        dataset_x, dataset_y, test_size=0.8, random_state=2)

    x_train_cv = cv.fit_transform(x_train)

    x_test_cv = cv.transform(x_test)

    classifier = LogisticRegression()
    classifier.fit(x_train_cv, y_train)

    predictions = classifier.predict(x_test_cv)
    print(accuracy_score(y_test, predictions))  #Accuracy printing

    sentence = status
    sentence = sentence.lower()
    words = word_tokenize(sentence)

    #Make of NER code.

    filename = "place.txt"
    data_place = pandas.read_csv(filename, names=['place'])
    data_placename = data_place['place']

    jam_place_integer = 0  #This will be needed for first and second phase
    jam_place2_integer = 0  #This will be needed for third phase
    jam_place3_integer = 0  #This will be needed for third phase because 'to' will have two places
    enter_to_second_phase = 0
    enter_to_third_phase = 0
    enter_to_fourth_phase = 0

    if 'near' in words or 'at' in words or 'of' in words:
        for x in range(0, len(words)):
            if words[x] == 'near' or words[x] == 'at' or words[x] == 'of':
                jam_place_integer = x + 1
    else:
        enter_to_second_phase = 1

    if enter_to_second_phase == 1:
        if 'in' in words:
            for x in range(0, len(words)):
                if words[x] == 'in':
                    if (str(words[x + 1]) == 'traffic'
                            or str(words[x + 1]) == 'jam'
                            or str(words[x + 1]) == 'grid'
                            or str(words[x + 1]) == 'lock'):
                        enter_to_third_phase = 1
                    else:
                        pos_tag_of_next_word = nltk.pos_tag(
                            word_tokenize(words[x + 1]))
                        word, tag = zip(*pos_tag_of_next_word)
                        pos_tag_of_next_word_str = str(''.join(tag))
                        if pos_tag_of_next_word_str == 'NN' or pos_tag_of_next_word_str == 'NNP':
                            jam_place_integer = x + 1
        else:
            enter_to_third_phase = 1

    if enter_to_third_phase == 1:
        if 'to' in words:
            for x in range(0, len(words)):
                if words[x] == 'to':
                    jam_place2_integer = x + 1
                    #Now finding if previous word is a place also
                    pos_tag_of_previous_word = nltk.pos_tag(
                        word_tokenize(words[x - 1]))
                    word, tag = zip(*pos_tag_of_previous_word)
                    pos_tag_of_previous_word_str = str(''.join(tag))
                    if pos_tag_of_previous_word_str == 'NN' or pos_tag_of_previous_word_str == 'NNP':
                        jam_place3_integer = x - 1

        else:
            enter_to_fourth_phase = 1

    # This approach has trouble with place names made of two separate words, like 'Manik Mia'
    if enter_to_fourth_phase == 1:
        jam_place = ''  # fallback so jam_place is always defined even if no known place matches
        for w in words:
            for x in range(data_placename.count()):
                if str(w) == str(data_placename[x]):
                    jam_place = w

    # (as noted above, multi-word place names are a known limitation here)

    jam_place_final_result = ''

    if jam_place_integer != 0:
        jam_place = words[jam_place_integer]
        jam_place_final_result = jam_place

    if enter_to_third_phase == 1 and enter_to_fourth_phase == 0:
        if jam_place3_integer == 0:
            jam_place = words[jam_place2_integer]
            jam_place_final_result = jam_place
        if jam_place3_integer != 0:
            jam_place2 = words[jam_place2_integer]
            jam_place3 = words[jam_place3_integer]
            jam_place_final_result = jam_place3 + ' to ' + jam_place2

    if enter_to_fourth_phase == 1:
        jam_place_final_result = jam_place

    #End of NER

    test_line_tfidf = cv.transform([sentence])
    prediction = classifier.predict(test_line_tfidf)

    final_result = ''

    if str(prediction) == '[0]':
        final_result = 'There may be no traffic jam at ' + jam_place_final_result
    if str(prediction) == '[1]':
        final_result = 'There may be traffic jam at ' + jam_place_final_result
    if str(prediction) == '[2]':
        final_result = 'Someone is trying to know the road condition of ' + jam_place_final_result

    print(final_result)
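
# Hedged usage sketch (not part of the original snippet): assuming traffic_data.txt and
# place.txt are present in the working directory, the function can be driven with a raw
# status string; the message below is a hypothetical example.
if __name__ == "__main__":
    extract_data("Heavy traffic jam near Dhanmondi right now")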
Example #51
0
# Import the nltk module for text analysis
import nltk

# Store the sentences to be preprocessed as string variables
sent1 = 'My only regret in life is that I did not drink more wine.'
sent2 = 'I drink to make other people more interesting.'
sent3 = 'An intelligent man is sometimes forced to be drunk to spend time with his fools.'

# Tokenize each sentence, POS-tag it, and print the result
print('POS tagging Sentence 1:')
tokens1 = nltk.word_tokenize(sent1)  # tokenize the sentence
print(nltk.pos_tag(tokens1))  # POS-tag the tokenized sentence and print it

print('POS tagging Sentence 2:')
tokens2 = nltk.word_tokenize(sent2)  # tokenize the sentence
print(nltk.pos_tag(tokens2))  # POS-tag the tokenized sentence and print it

print('POS tagging Sentence 3:')
tokens3 = nltk.word_tokenize(sent3)  # tokenize the sentence
print(nltk.pos_tag(tokens3))  # POS-tag the tokenized sentence and print it
def toke_n_tag(text):
    pos_tagged_text = pos_tag(word_tokenize(text))
    return (pos_tagged_text, text)
def extractFeaturesAndWriteBio(READ_PATH,file_type):
    
    

    global ALL_poems,bio,cnt, start_time

    inp=0
    sub_cnt=0
    words_total=0
    lines_total=0

    pause_every = 0

    for subdir, dirs, files in os.walk(READ_PATH):

        # RANDOM SELECT
        random.shuffle(files)


        for file in files:
            
            num_of_files = len(files)-1 # deduct the DS_store
            #print (num_of_files,'readDirectory',READ_PATH)
            
            if file_type in file  and 'readme' not in file:

                # ID
                id=file.split(".")[0]
                #print "\n\n*********\nID:",id

                filenames.append(id)
                cnt+=1

                # print('')
                #print('')
                # print('OPENED:',id)
                # print('')
                #print('')

                ##############
                #  HOW MANY? #
                ##############
                sub_cnt+=1
                if sub_cnt>=int(inp):
                    if int(inp) != 0:
                        end_time = time.time()
                        es = end_time-start_time
                        print sub_cnt, "poems,\n",lines_total,"lines,\n",words_total,"words \ngenerated in\n",("%.2f" % es),"seconds"
                        
                    words_total=0
                    lines_total=0

                    # RESTART

                    sub_cnt=0
                    inp = raw_input("\n\n^^^^^^^^^^^^^^\n\nHow many poems do u want? ")

                    if not inp:
                        print "You entered nothing! 10 poems will be generated."
                        inp=10

                    sleep_time = raw_input("\nSleep duration?")
                    if not sleep_time:
                        print "You entered no time! 10 second wait assigned."
                        sleep_time=10

                    pause_every = raw_input("\nPause every 1 or 2 or ...?")
                    if not pause_every:
                        print "You entered nothing! Pause will occur every 10 poems."
                        pause_every=10


                    print "\n\n^^^^^^^^^^^^^^^"
                    start_time = time.time()

                print 'Poem #',sub_cnt

                poem_replaced = ""
                replacement_word = ""
                author=""
                titles=""
                title=""
                new_title=""

                replaced_ls =[]
                new_titles_ls = []
                quit_language=0

                #################################################################
                # Load  POEM TEXT FILE (based on id extracted from Alchemy JSON)    #
                #################################################################

                txt_fn_path = DATA_DIR + READ_TXT_PATH + id.split("_")[1]+".txt"
                #print "txt_fn_path:",txt_fn_path

                if os.path.isfile(txt_fn_path) and cnt>0:
                    txt_data=open(txt_fn_path).read()

                    # http://blog.webforefront.com/archives/2011/02/python_ascii_co.html
                    # txt_data.decode('ISO-8859-2') .decode('utf-8')
                    # unicode(txt_data)

                    author=txt_data.split("****!****")[0].strip(' \t\n\r')
                    
                    title=txt_data.split("****!****")[1].strip(' \t\n\r')
                    
                    bio=txt_data.split("****!****")[2]#.strip(' \t\n\r')

                    ######  CLEAN BIO
                    bio.replace("\t","&#9;")
                    bio.replace("\n"," <br>")
                    bio.replace("\r"," <br>")
                    poem_replaced=bio
                    #print poem_replaced

                    ###############################
                    # REPLACE AUTHOR NAME
                    ##############################
                    author_ln=author.split(" ")[-1]
                    author_fn=author.split(" ")[:-1]
                    #
                    #poem_replaced = poem_replaced.replace(author_ln,"Jhave")

                    #######################
                    # fake AUTHOR
                    #######################
                    
                    new_author= " ".join(random.choice(authors).split(" ")[1:-2])+" "+random.choice(authors).split(" ")[-2]
                    

                    #######################
                    # replace BOOK TITLES
                    #######################
                    #print "TITLES"]
                    new_title = getNewTitle("title").encode('utf-8')
                             

                    ############################
                    # replace years with another
                    ############################
                    for w1 in poem_replaced.split("("):
                        for w2 in w1.split(")"):
                            if w2 is not None and w2.isdigit():
                                new_num = random.randint(int(w2)-5,int(w2)+5)
                                #print "REPLACING #:",w2,new_num
                                poem_replaced = poem_replaced.replace(w2,str(new_num))
                                replaced_ls.append(new_num)                            
                                               

                    #################
                    # Load JSON     #
                    #################
                    response = loadJSONfile(READ_JSON_PATH+"poetryFoundation_"+id.split("_")[1]+"_Alchemy_JSON.txt")

                    if response != "failed":

                        if response.get('entities') is not None:
                            for idx,entity in enumerate(response['entities']):

                                #print idx
                                ce = entity['text'].replace("0xc2"," ")
                                ce = ce.replace("0xe2","'")
                                ce = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, ce)
                                ce = ce.encode('utf-8')

                                try:
                                    content = ce.decode('utf-8').encode('ascii', 'xmlcharrefreplace')
                                except UnicodeDecodeError:
                                    print "AAAARGGGGHHH!!!! (UnicodeDecodeError; skipping this entity)"
                                    continue

                                if content in poem_replaced:
                                                       
                                    ################################################
                                    # Replace similar entities from other JSON     #
                                    ################################################
                                    replacement_entity = findSimilarEntityinRandomJSON(content,entity['type'])

                                    cr = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, replacement_entity)

                                    poem_replaced = poem_replaced.replace(content,replacement_entity)
                                    replaced_ls.append(replacement_entity)
                    

                    ##########################
                    #   POS REPLACMENT       #
                    ##########################

                    token_tuples = nltk.word_tokenize(poem_replaced)
                    tt = nltk.pos_tag(token_tuples)

                    #################
                    #  ADJECTIVES   #
                    #################
                    for i in tt:
                        if "/i" not in i[0] and len(i[0])>3 and i[0] != "died":
                            origw =  re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, i[0])
                            origw =import_utilities.strip_punctuation(origw) 
                            if i[1]=='JJ' :
                                JJr = random.choice(JJ)
                                # # JJr =  re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, JJr)
                                # JJr = import_utilities.strip_punctuation(JJr)
                                JJr = import_utilities.moveBeginAndEndPunctuationFromStrToString(i[0],JJr.strip())
                                
                                if i[0].istitle():
                                    JJr = JJr.title()

                                poem_replaced = re.sub(r'\b' + import_utilities.strip_punctuation(i[0]) + r'\b', JJr, poem_replaced,1)#poem_replaced.replace(i[0],JJr,1)
                                replaced_ls.append(JJr)
                            if i[1]=='RB':
                                RBr = random.choice(RB)
                                RBr = import_utilities.moveBeginAndEndPunctuationFromStrToString(i[0],RBr.strip())

                                if i[0].istitle():
                                    RBr = RBr.title()
                                poem_replaced = re.sub(r'\b' + import_utilities.strip_punctuation(i[0])  + r'\b', RBr, poem_replaced,1)
                                replaced_ls.append(RBr)


                    ########################
                    # IS IT ENGLISH?       #
                    ########################
                    for line  in poem_replaced.split('\n\r'):

                        

                        if len(line)>0 :
                            if "english" not in import_utilities.get_language(line):
                                quit_language+=1
                                #print "NOT english:",quit_language,line
                            else:
                                quit_language-=1

                    
                    #########################
                    #   SYNSET REPLACE      #
                    #########################
                    for idx,word in enumerate(poem_replaced.split(' ')):


                        

                        if "<br>" not in word and "&#9;" not in word and len(word)>0 and "~~~~!~~~" not in word:


                            words_total+=1

                            #########################
                            #   PRONOUN ' VERB      #
                            #########################
                            if len(word.split("'"))>1:
                                if word.split("'")[0] in personal_pronouns:
                                    replacement_word = random.choice(personal_pronouns)+"'"+word.split("'")[1]+' '
                                    poem_replaced = poem_replaced.replace(word,replacement_word)
                                #print "word,",word,"replacement_word:",replacement_word
                           
                            ####################################################
                            # Replacement of OTHERs                            #
                            ####################################################

                            elif not word.lower().strip(" \n\t\r") in stopwords.words('english'):

                                # take off leading brackets, commas etc...
                                word_punct_nopunct = import_utilities.strip_punctuation_bool(word)
                                word_nopunct = word_punct_nopunct['word'].strip(" \n\t\r")
                                word_punct = word_punct_nopunct['punct']
                                punct_bool = word_punct_nopunct['punct_bool']

                             

                                #######################################################
                                # MAIN EXCHANGE PROCESS CALL >>>>>>>   GET THE SYNSET #
                                #######################################################    
                                if word_nopunct[-4:].lower()=="here":
                                    similarterm=random.choice(import_utilities.heres)
                                else:
                                    #print "WORD:",word_nopunct
                                    if len(word_nopunct)>2:
                                        similarterm = import_utilities.find_synset_word(word_nopunct)#(word.lstrip().rstrip())
                                    else:
                                        similarterm = word_nopunct  # too short to look up a synset; keep the word as-is

                                
                                ############################################
                                # manually get rid of some terrible choices
                                ############################################
                                if similarterm == "ilk":
                                    ##print "like"
                                    similarterm = "like"
                                if similarterm == "ope":
                                    ##print "doth"
                                    similarterm = "does"
                                if similarterm == "information technology":
                                    ##print "doth"
                                    similarterm = "it"
                                if similarterm == "Nox":
                                    ##print "doth"
                                    similarterm = "dark"

                                #######################################                      
                                # abbreviations for f*****g states!   #
                                #######################################
                                if word_nopunct.upper() in import_utilities.state_abbrev and word_nopunct.lower() not in stopwords.words('english') and "me," not in word:
                                    tmp = similarterm
                                    if word_nopunct == "oh": 
                                        similarterm = random.choice(import_utilities.exclaims)
                                    else:
                                        similarterm = random.choice(RESERVOIR)
                                    #print word_nopunct," replaced by", tmp, "replaced with:",similarterm, "in:",line

                                ##############
                                # hyphenated #
                                ##############
                                hyp =word.split("-")
                                #print word,len(hyp)
                                if len(hyp) >1:
                                    similarterm=""
                                    for w in hyp:
                                        if len(w) > 2:
                                            similarterm +=  import_utilities.find_synset_word(w)+"-"
                                    similarterm = import_utilities.strip_underscore(similarterm[:-1])
                                    #print "hyphenated:",word,"replaced by: "+similarterm
                                        


                                
                                #########################################################    
                                # is it a TRUNCATED VERB slang as in singin or wishin   #
                                #########################################################
                                if similarterm == word_nopunct and len(word)>2 and 'in' in word_nopunct[-2:]:
                                    similarterm = import_utilities.find_synset_word(word_nopunct+'g')
                                    ## #print "TRUNCATED SLANG word: '"+word+"'",similarterm
                                    interim = import_utilities.lemma(similarterm)
                                    ## #print interim
                                    similarterm = import_utilities.conjugate(interim, tense=import_utilities.PARTICIPLE, parse=True)[:-1] 
                                    # # # #print word,"widx:",widx," line_pos_tags[widx][0]:",line_pos_tags[widx][0]," line_pos_tags[widx][1]:",line_pos_tags[widx][1]
                                   

                                #################      
                                # SWEAR WORD    #
                                #################
                                ##print "at the garden of if:", word
                                if word_nopunct in import_utilities.curses:
                                    similarterm = random.choice(import_utilities.curses)
                                    ##print "SWEAR WORD word: '"+word+"'",similarterm


                                if len(hyp) >1:
                                    replacement_word = similarterm
                                else:
                                    replacement_word = word.replace(word_nopunct, similarterm)
                                    replacement_word = import_utilities.strip_underscore(replacement_word)
                                    replacement_word = import_utilities.replaceNumbers(replacement_word)

                                #########################
                                # RESERVOIR_OF_WEIRDNESS  #
                                #########################  

                                if word_nopunct.lower() in import_utilities.impera:
                                    replacement_word=random.choice(import_utilities.impera)
                                    #print word,"IMPERA:",replacement_word
                                elif word_nopunct.lower() in import_utilities.conjuncts:
                                    replacement_word=random.choice(import_utilities.conjuncts)
                                    #print word," CONJUNCTION replaced with",replacement_word
                                elif word_nopunct.lower() in import_utilities.indef_prono:
                                    replacement_word=random.choice(import_utilities.indef_prono)
                                    #print word," INDEF_prono replaced with",replacement_word
                                elif word_nopunct.lower() in import_utilities.prepo:
                                    replacement_word=random.choice(import_utilities.prepo)
                                    #print word," prepo replaced with",replacement_word
                                elif word_nopunct.lower() in import_utilities.rel_prono:
                                    replacement_word=word
                                    #print word," rel_prono LEAVE alone: ",replacement_word
                                elif word_nopunct.lower()[-2:] =="ly":
                                    replacement_word=import_utilities.strip_underscore(import_utilities.find_synset_word(word))#(word[:-2])
                                    #print word," ADVERB: ",replacement_word
                                    # if replacement_word[-2:] !="ly":
                                    #     replacement_word +="ly"
                                                                            
                                else:
                                    if len(hyp) <2 and "like" not in word_nopunct and import_utilities.singularize(word_nopunct) ==  import_utilities.singularize(replacement_word) and word_nopunct.lower() not in import_utilities.stopwords_ls:

                                        if word_nopunct not in RESERVOIR and quit_language<0 and import_utilities.countPunctuation(word)<1 and len(word_nopunct)>3 and not word_nopunct.istitle(): 
                                            
                                            #print "ADDING",word,"to reservoir"
                                            RESERVOIR.append(word)
                                            
                                            replacement_word = random.choice(RESERVOIR)
                                            #print word_nopunct,"replaced from reservoir with", replacement_word
                                       # print "'"+word_nopunct+"'  vs RESERVOIR  replacement_word:",replacement_word #,"    new_line:",new_line
                                if quit_language>1 and not word_nopunct.istitle():
                                    #print quit_language, "Probably foreign language: make a word salad in english"
                                    replacement_word = random.choice(RESERVOIR)
                                    #print word_nopunct,"OTHER replaced from reservoir with", replacement_word
                                

                                # REPLACEMENT: swap the word in place, but only if it is still present
                                # (earlier substitutions may already have removed it)
                                poem_ls = poem_replaced.split(' ')
                                if word in poem_ls:
                                    idx = poem_ls.index(word)
                                    # #print idx,",", poem_ls[idx],",", word ,",",replacement_word
                                    poem_ls[idx] = replacement_word
                                poem_replaced = " ".join(poem_ls)


                                #poem_replaced = poem_replaced.replace(word,replacement_word)



                    # CORRECT the "A" to "An"    
                    for idx,word in enumerate(poem_replaced.split(" ")):
                        # poem_replaced = poem_replaced+"A organism"
                        if len(word)>0 and word[0].lower() in the_vowels and poem_replaced.split(" ")[idx-1].lower() =="a" :      
                                if poem_replaced.split(" ")[idx-1] =="a":
                                    old_str = "a "+poem_replaced.split(" ")[idx]    
                                    new_str = "an "+poem_replaced.split(" ")[idx]
                                else:
                                    old_str = "A "+poem_replaced.split(" ")[idx]    
                                    new_str = "An "+poem_replaced.split(" ")[idx]
                                poem_replaced = poem_replaced.replace(old_str,new_str)

                        # poem_replaced = poem_replaced+"An consonant"
                        if len(word)>0 and word[0].lower() not in the_vowels and poem_replaced.split(" ")[idx-1].lower() =="an" :      
                                if poem_replaced.split(" ")[idx-1] =="an":
                                    old_str = "an "+poem_replaced.split(" ")[idx]    
                                    new_str = "a "+poem_replaced.split(" ")[idx]
                                else:
                                    old_str = "An "+poem_replaced.split(" ")[idx]    
                                    new_str = "A "+poem_replaced.split(" ")[idx]
                                poem_replaced = poem_replaced.replace(old_str,new_str)
                                #print "FOUND correction needed",old_str,new_str


                    #########################
                    #   WRITE SINGLE POEM   #
                    #########################
                    tmp_poem=""   

                    # poem_replaced.replace("\t","&#9;")
                    # poem_replaced.replace("\n"," <br>")
                    # poem_replaced.replace("\r"," <br>")

                    HTML_poem=""
                    for line in poem_replaced.split("\n"):
                        lines_total+=1
                        #print "LINE", line
                        HTML_poem += line+"<br>"

                    if len(response) >0 and len(id.split("_"))>1:
                        # ALL_poems = ALL_poems_intro + " ".join(i for i in ALL_poems.split("</h2>.")[0:])+"<br><br>~~~~~~~~~~~~~~~~~~~~~~~~~~<br>[ A poem generated from template : <b>"+ author+"</b>, <i>"+ title +"</i> ]<br><br><b>"+new_title+"<br><br></b>"+HTML_poem
                        ALL_poems += "<br><br>~~~~~~~~~~~~~~~~~~~~~~~~~~<br>[ A poem generated from template : <b>"+ author+"</b>, <i>"+ title +"</i> ]<br><br><b>"+new_title+"<br><br></b>"+HTML_poem

                        tmp_poem= "[A poem generated from template: "+ author+", '"+ title +"'']\n\n'"+new_title+"'\nby\n"+new_author+"\n\n"+poem_replaced

                        
                        #####################
                        #                   #
                        #                   #
                        #     PAUSE IT      #
                        #                   #
                        #                   #
                        #####################

                        # sleep_time=0.03*sub_cnt
                        #sleep_time=30.03*sub_cnt
                        #print "sub_cnt=",sub_cnt # ,"sleep_time=",sleep_time
                        if (int(sub_cnt)%int(pause_every) == 0 and int(sub_cnt) !=0):
                            time.sleep(int(sleep_time))

                        # if sub_cnt>=1:
                        #     raw_input("Press Enter to continue...")

                        #####################
                        #                   #
                        #                   #
                        #       PRINT       #
                        #                   #
                        #                   #
                        #####################

                        print "\n******\n"+tmp_poem


                        txt_fn = id.split("_")[1]+"_POEMs.txt"

                        # WRITE_BIO_PATH = DATA_DIR+"generated/POEMS/POEMS_"+datetime.datetime.now().strftime('%Y-%m-%d_%H')+"/"
                        # if not os.path.exists(WRITE_BIO_PATH):
                        #         os.makedirs(WRITE_BIO_PATH)

                        txt_fn_path = GENERATED_DIR+txt_fn
                        f_txt=open(txt_fn_path,'w')
                        f_txt.write(tmp_poem)#.encode('utf-8'))       
                        f_txt.close();   
                        #print "\nTXT file created at:",txt_fn_path

                        
                        # #######
                        # #   write them all.... wasteful... but useful if run is interrupted....
                        # ###########    
                        # ALL_poems = ALL_poems.replace("$$datetime$$",datetime.datetime.now().strftime('%Y-%m-%d at %H:%M'))
                        # ALL_poems = ALL_poems.replace("$$cnt$$",str(cnt))
                        # print "cnt",cnt
                        # ALL_poems = ALL_poems.replace("$$gentime$$",str(time.time() - start_time))

                        # # ALL POEMS
                        # txt_fn = datetime.datetime.now().strftime('%Y-%m-%d_%H')+"_poetryFoundation_generatedPOEMS_"+type_of_run+".html"
                        # txt_fn_path = DATA_DIR+"generated/POEMS/"+txt_fn
                        # f_txt=open(txt_fn_path,'w')
                        # f_txt.write(ALL_poems+"</hmtl>")       
                        # f_txt.close();   
                        # print "\nTXT file created at:",txt_fn_path





                    else:
                        "~~~~~~~~~~~~~~~~!!!!!!!!!! EMPTY response:", author
    VBN verb, past participle
    VBP verb, sing. present, non-3d
    VBZ verb, 3rd person sing. present
    WDT wh-determiner
    WP wh-pronoun
    WP$ possessive wh-pronoun
    WRB wh-adverb"""

tag_to_text_dict = {}
for line in tag_to_text.split("\n"):
    line = line.strip()
    if line:  # skip blank lines so split() does not fail
        tag_to_text_dict[line.split()[0]] = " ".join(line.split()[1:])


sentence = input("Enter your sentence : ")
parts_of_speech = nltk.pos_tag(nltk.word_tokenize(sentence))
print("\nThe parts of speech in your sentence are : \n")
for tup in parts_of_speech:
    if tup[0] != '.' and tup[0] != ',':
        # fall back to the raw tag if it is not in the description dictionary
        print(tup[0] + " : " + tag_to_text_dict.get(tup[1], tup[1]))


print("\nEnter a word you would like synonyms and antonyms for : ")
word = Word(input())

synonyms = list(set([l.name() for syn in word.get_synsets() for l in syn.lemmas()]))
antonyms = list(set([ant.name() for syn in word.get_synsets() for l in syn.lemmas() for ant in l.antonyms()]))

print(f" Synonyms : "+",".join(synonyms))
print(f" Antonyms : "+",".join(antonyms))
import re

import openpyxl
import nltk
from nltk import word_tokenize, pos_tag

wb = openpyxl.load_workbook('training.xlsx')
ws = wb['training_set']  # get_sheet_by_name() is deprecated/removed in newer openpyxl

for i in range(2, 1785):
    count = 0
    f_essay = ws.cell(row=i, column=3)
    essay = f_essay.value

    letters_only = re.sub("[^a-zA-Z]", " ", essay)
    essay = letters_only.lower()

    x = nltk.word_tokenize(essay)
    y = nltk.pos_tag(x)
    adjectives = [word for word, pos in y if pos == 'JJ']  # 'JJ' is the adjective tag
    count = len(adjectives)
    ws.cell(row=i, column=10).value = count
wb.save('training.xlsx')
Example #56
0
                abnormal_lst.add(m.group(0))
                for i in range(1, 4):
                    split_lst.append(m.group(i))

        for t in token_sets[j]:
            if t not in abnormal_lst: all_tokens.append(t)
            else:
                all_tokens.extend(split_lst)

        token_sets[j] = all_tokens

        for i in range(len(token_sets[j])):
            if token_sets[j][i] not in Specific_NN:
                token_sets[j][i] = token_sets[j][i].lower()

    pos_tagged_tokens = [nltk.pos_tag(ts) for ts in token_sets]

    for i in range(len(pos_tagged_tokens)):
        for j in range(len(pos_tagged_tokens[i])):
            if pos_tagged_tokens[i][j][0] in words2pos:
                pos_tagged_tokens[i][j] = list(pos_tagged_tokens[i][j])
                pos_tagged_tokens[i][j][1] = pos_tagged_tokens[i][j][0]
                pos_tagged_tokens[i][j] = tuple(pos_tagged_tokens[i][j])

    interaction_collections = []

    # MD + TO + VB + NN
    for ts in pos_tagged_tokens:
        target_pos = ['MD', 'TO', 'VB', 'NN']
        interactions = extract_simple_sequences(ts, target_pos)
        if len(interactions) > 0:
Example #57
0
def text_tokens_2(
    text,
    lower_bound_percentage=0,
    higher_bound_percentage=1,
    minimal_word_length=0,
    remove_punctuations=False,
    remove_non_letter_characters=False,
    lemmatize_the_words=False,
    stemmer_the_words=False,
    part_of_speech_filter=False,
    english_text_filter=False,
    stop_words_filter=False,
    other_words_filter=False,
    remove_adjacent_tokens=False,
    tokens_form=True,
    stop_words=stop_words,
    some_other_words=some_other_words
):
    text = text.lower()
    if remove_punctuations:
        text = text.translate(str.maketrans('', '', string.punctuation))
    if remove_non_letter_characters:
        text = re.sub(r'[^a-zA-Z]', " ", text)
    tokens = nltk.word_tokenize(text)
    howmany_tokens = len(tokens)
    tokens = tokens[int(howmany_tokens *
                        lower_bound_percentage):int(ceil(howmany_tokens * higher_bound_percentage))]
    if part_of_speech_filter:
        token_pos = nltk.pos_tag(tokens)
        tokens = [word for (word, pos) in token_pos if pos.startswith('N') or pos.startswith('J')]
    if english_text_filter:
        tokens = [token for token in tokens if token in Englishtext]
    if lemmatize_the_words:
        tokens = [lemmatizer().lemmatize(token) for token in tokens]
        stop_words = set([lemmatizer().lemmatize(word) for word in stopwords.words('english')])
        some_other_words = set([lemmatizer().lemmatize(word) for word in some_other_words])
    if stemmer_the_words:
        tokens = [SnowballStemmer_().stem(token) for token in tokens]
        stop_words = set([SnowballStemmer_().stem(word) for word in stopwords.words('english')])
        some_other_words = set([SnowballStemmer_().stem(word) for word in some_other_words])
    tokens = [token for token in tokens if len(token) >= minimal_word_length]
    if other_words_filter:
        tokens = [token for token in tokens if token not in some_other_words]
    p = nltk.pos_tag(tokens)
    grammar = r"""
         NP: {(<DT>|<JJ>*)<NN.*>+(<CC><NN.*>+)?}    # noun phrase chunks
         VP: {<TO>?<VB.*>}          # verb phrase chunks
         PP: {<IN>}                 # prepositional phrase chunks
         CLAUSE: {<VP>?<NP>+}
         """
    cp = nltk.RegexpParser(grammar)
    if p:
        result = cp.parse(p)
        tree = result.subtrees()
        goodones = []
        badones = []
        for sub in tree:
            if sub.label() == 'CLAUSE':
                if len(list(sub)) >= 3:
                    goodones.append(sub)
                else:
                    badones.append(sub)
        tokens = []
        if goodones:
            for g in goodones:
                for w, po in g.leaves():
                    tokens.append(w)
        else:
            for b in badones:
                for w, po in b.leaves():
                    tokens.append(w)
        if stop_words_filter:
            tokens = [token for token in tokens if token not in stop_words]
        if remove_adjacent_tokens:
            remove_adjacent(tokens)
        if tokens_form:
            return tokens
        else:
            return ' '.join(tokens)
    else:
        return []
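# Illustrative sketch (not part of the original example): the same kind of chunk grammar
# used above can be exercised on its own with nltk.RegexpParser to see which CLAUSE chunks
# survive. The sample sentence and the _demo_* names are assumptions for demonstration only.
import nltk

_demo_grammar = r"""
     NP: {(<DT>|<JJ>*)<NN.*>+(<CC><NN.*>+)?}    # noun phrase chunks
     VP: {<TO>?<VB.*>}          # verb phrase chunks
     PP: {<IN>}                 # prepositional phrase chunks
     CLAUSE: {<VP>?<NP>+}
     """
_demo_tags = nltk.pos_tag(nltk.word_tokenize("The hungry dog chased a small cat"))
_demo_tree = nltk.RegexpParser(_demo_grammar).parse(_demo_tags)
for _sub in _demo_tree.subtrees():
    if _sub.label() == 'CLAUSE':
        print(_sub)  # prints each CLAUSE chunk (an optional VP followed by noun phrases)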
Example #58
0
    print(lStem.stem(str(i)))

#SnowBallStemmer
sStem = SnowballStemmer('english')
print("SnowBall Stemming : \n")
for i in tokens[0:50]:
    print(sStem.stem(str(i)))

#PorterStemmer
pStem = PorterStemmer()
print("Porter Stemming : \n")
for i in tokens[0:50]:
    print(pStem.stem(str(i)))

# POS-tagging
print("Part of Speech Tagging :\n", pos_tag(word_tokenize(text)))

# Lemmatization
lemmatizer = WordNetLemmatizer()
print("Lemmatization :\n")
for tok in tokens[0:50]:
    print(lemmatizer.lemmatize(str(tok)))

# Trigram
print("Trigrams :\n")
trigram = list(ngrams(tokens[0:20], 3))  # word trigrams over the first 20 tokens
print(trigram)

# Named Entity Recognition
Example #59
0
def text_tokens(
    text,
    lower_bound_percentage=0,
    higher_bound_percentage=1,
    minimal_word_length=0,
    lower_case=False,
    remove_punctuations=False,
    remove_non_letter_characters=False,
    lemmatize_the_words=False,
    stemmer_the_words=False,
    add_pos_feature=False,
    url_filter=False,
    parentheses_filter=False,
    prime_s_filter=False,
    number_filter=False,
    part_of_speech_filter=False,
    english_text_filter=False,
    stop_words_filter=False,
    other_words_filter=False,
    remove_adjacent_tokens=False,
    tokens_form=True,
    stop_words=stop_words,
    some_other_words=some_other_words
):
    if lower_case:
        text = text.lower()
        #Englishtext = set(w.lower() for w in W.words())
    text = re.sub(r'\n', "", text)
    if url_filter:
        url_pattern = re.compile(
            r'((http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-zA-Z0-9]+([\-\.]{1}[a-zA-Z0-9]+)*\.[a-zA-Z]{2,5}(:[0-9]{1,5})?(\/.*)?)'
        )
        text = re.sub(url_pattern, " ", text)
    if parentheses_filter:
        parentheses_pattern = re.compile(r'(\([^)]+\))')
        text = re.sub(parentheses_pattern, " ", text)
    if prime_s_filter:
        prime_s_pattern = r"('s|\?s)"
        text = re.sub(prime_s_pattern, "", text)
    if remove_punctuations:
        text = text.translate(str.maketrans('', '', string.punctuation))
    if remove_non_letter_characters:
        text = re.sub(r'[^a-zA-Z0-9]', " ", text)
    if number_filter:
        text = re.sub(r'[0-9]', " ", text)
    tokens = nltk.word_tokenize(text)
    howmany_tokens = len(tokens)
    if stop_words_filter:
        tokens = [token for token in tokens if token not in stop_words]
    tokens = tokens[int(howmany_tokens *
                        lower_bound_percentage):int(ceil(howmany_tokens * higher_bound_percentage))]
    if part_of_speech_filter:
        token_pos = nltk.pos_tag(tokens)
        tokens = [word for (word, pos) in token_pos if pos.startswith('N')]
    if add_pos_feature:
        token_pos = nltk.pos_tag(tokens)
        tokens = [word + '_' + pos for (word, pos) in token_pos]
    if english_text_filter:
        if add_pos_feature:
            tokens = [token for token in tokens if token.split('_')[0] in Englishtext]
        else:
            tokens = [token for token in tokens if token in Englishtext]
    if lemmatize_the_words:
        if add_pos_feature:
            tokens = [
                lemmatizer().lemmatize(token.split('_')[0]) + '_' + token.split('_')[1]
                for token in tokens
            ]
        else:
            tokens = [lemmatizer().lemmatize(token) for token in tokens]
        #stop_words = set([lemmatizer().lemmatize(word) for word in stopwords.words('english')])
        some_other_words = set([lemmatizer().lemmatize(word) for word in some_other_words])
    if stemmer_the_words:
        if add_pos_feature:
            tokens = [
                SnowballStemmer_().stem(token.split('_')[0]) + '_' + token.split('_')[1]
                for token in tokens
            ]
        else:
            tokens = [SnowballStemmer_().stem(token) for token in tokens]
        #stop_words = set([SnowballStemmer_().stem(word) for word in stopwords.words('english')])
        some_other_words = set([SnowballStemmer_().stem(word) for word in some_other_words])
    if add_pos_feature:
        tokens = [token for token in tokens if len(token.split('_')[0]) >= minimal_word_length]
    else:
        tokens = [token for token in tokens if len(token) >= minimal_word_length]
    if other_words_filter:
        if add_pos_feature:
            tokens = [token for token in tokens if token.split('_')[0] not in some_other_words]
        else:
            tokens = [token for token in tokens if token not in some_other_words]
    if remove_adjacent_tokens:
        remove_adjacent(tokens)
    if tokens_form:
        return tokens
    else:
        return ' '.join(tokens)
Example #60
0
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# ===== POS Tagging and NER using NLTK =====

sent = '''Professor Tan Eng Chye, NUS Deputy President and Provost, and Professor 
Menahem Ben-Sasson, President of HUJ signed the joint degree agreement at NUS, 
in the presence of Ambassador of Israel to Singapore Her Excellency Amira Arnon 
and about 30 invited guests, on Sept 25, 2013.
'''

# The input for POS tagger needs to be tokenized first.
sent_pos = pos_tag(word_tokenize(sent))
print(sent_pos)

# ===== NER using NLTK =====
# The input for the NE chunker needs to have POS tags.
sent_chunk = ne_chunk(sent_pos)
print(sent_chunk)
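
# ===== Extracting the labelled entities from the chunk tree (illustrative sketch) =====
# This is not part of the original exercise; it just shows one way to pull the
# (entity text, entity label) pairs out of the nltk.Tree returned by ne_chunk.
from nltk.tree import Tree

entities = []
for node in sent_chunk:
    if isinstance(node, Tree):  # named-entity subtrees carry a label such as PERSON or GPE
        entity_text = " ".join(token for token, pos in node.leaves())
        entities.append((entity_text, node.label()))
print(entities)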

# ===== Now try creating your own named entity and noun phrase chunker ====
# We need to define the tag patterns to capture the target phrases and use
# RegexpParser to chunk the input with those patterns.
# Some minimal tag patterns are given here.

grammar = r"""
  NE: {<NNP>+(<IN|CC|TO><NNP>)*}      # chunk sequences of proper nouns
  NP:                 
      {<DT|CD><JJ>?<NNS|NN>}     
"""