def __init__(self, text, product_name):
		self.candidate_features = []
		self.feature_sentences = []
		self.product_name = product_name.lower().split('-')[0].split('_')
		t = Tokenizer()
		sents = t.sent_tokenize(text.lower())
		p = POSTagger()
		wnl = WordNetLemmatizer()
		for sent in sents:
			tagged_sent = p.nltk_tag(t.word_tokenize(sent))
			feature_sent = {}
			feature_sent['sentence'] = sent
			feature_sent['tags'] = tagged_sent
			feature_sent['nouns'] = []
			feature_sent['noun_phrases'] = []
			for i in range(0, len(tagged_sent)):
				(word, tag) = tagged_sent[i]
				#Don't include proper nouns
				if tag.startswith('N') and tag != 'NNP':
					"""
					Consecutive nouns might form a feature phrase. Eg. Picture quality is a phrase.
					Meaningless phrases like 'quality digital' are removed later as their frequeny of occurence is	low. """
					if i > 0 and len(feature_sent['nouns']) > 0 and tagged_sent[i - 1][0] == feature_sent['nouns'][-1] and feature_sent['sentence'].find(feature_sent['nouns'][-1] + ' ' + word) > -1:
						feature_sent['noun_phrases'].append(wnl.lemmatize(feature_sent['nouns'].pop() + ' ' + word))
					else:
						feature_sent['nouns'].append(wnl.lemmatize(word))
					
			self.feature_sentences.append(feature_sent)
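
# A minimal, hedged illustration of the consecutive-noun chunking above, written against
# plain nltk; the Tokenizer/POSTagger wrappers (not shown in this listing) are assumed to
# delegate to nltk's word tokenizer and POS tagger.
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
sent = "the picture quality of this camera is great"
tagged = pos_tag(word_tokenize(sent))

nouns, noun_phrases = [], []
for i, (word, tag) in enumerate(tagged):
    if tag.startswith('N') and tag != 'NNP':
        # adjacent nouns that also appear side by side in the raw sentence are merged
        # into a candidate feature phrase such as "picture quality"
        if nouns and tagged[i - 1][0] == nouns[-1] and (nouns[-1] + ' ' + word) in sent:
            noun_phrases.append(wnl.lemmatize(nouns.pop() + ' ' + word))
        else:
            nouns.append(wnl.lemmatize(word))

print(nouns)         # expected to contain 'camera'
print(noun_phrases)  # expected to contain 'picture quality'
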
def returnKeywordFromList(convertpath):
    token_dict = {}
    i=0

    #nltk.download()
    wnl = WordNetLemmatizer()
    fileName = {}
    #print file
    #print str(i)+ file
    #file_path = subdir + os.path.sep + file
    shakes = open(convertpath, 'r')
    text = shakes.read()
    shakes.close()
    # strip characters that cannot be encoded as ASCII (Python 2 idiom)
    lowers = "".join(map(lambda l: l.decode('unicode_escape').encode('ascii', 'ignore'), text))
    no_punctuation = re.sub(r'[?|$|.|!0-9()=+-\/\'\"\|]', r'', lowers)
    # lemmatize each distinct token in place
    d = {v: True for v in no_punctuation.split()}
    for token in d.keys():
        no_punctuation = no_punctuation.replace(token, wnl.lemmatize(token))
    fileName[i] = convertpath
    token_dict[i] = no_punctuation.replace("\n", " ").replace("\r", "")
    #break

    #this can take some time
    ##print token_dict.values()
    tfidf_vect = TfidfVectorizer(stop_words =stops, ngram_range=(1, 2))
    # #
    # count_vect.stop_words = stops
    #
    X_train_counts = tfidf_vect.fit_transform(token_dict.values())
    #print tfidf_vect.get_feature_names()
    #print(sortSparseMatrix(X_train_counts.getrow(0),rev=False, only_indices=False))
    sortedMatrix = sortSparseMatrix(X_train_counts.getrow(0),rev=True, only_indices=False)[0]
    x = map(lambda pair: pair[0], sortedMatrix)  # feature indices only (currently unused)
    result = getKeywordAlgorithms(1,sortedMatrix)
    return map(lambda key:tfidf_vect.get_feature_names()[key],result)
def text2sents(text, lemmatize=False, stemmer=None):
    """
    converts a text into a list of sentences consisted of normalized words
    :param text: list of string to process
    :param lemmatize: if true, words will be lemmatized, otherwise -- stemmed
    :param stemmer: stemmer to be used, if None, PortedStemmer is used. Only applyed if lemmatize==False
    :return: list of lists of words
    """
    sents = sent_tokenize(text)

    tokenizer = RegexpTokenizer(r'\w+')

    if lemmatize:
        normalizer = WordNetLemmatizer()
        tagger = PerceptronTagger()
    elif stemmer is None:
        normalizer = PorterStemmer()
    else:
        normalizer = stemmer

    sents_normalized = []

    for sent in sents:
        sent_tokenized = tokenizer.tokenize(sent)
        if lemmatize:
            sent_tagged = tagger.tag(sent_tokenized)
            sent_normalized = [normalizer.lemmatize(w[0], get_wordnet_pos(w[1])) for w in sent_tagged]
        else:
            sent_normalized = [normalizer.stem(w) for w in sent_tokenized]

        sents_normalized.append(sent_normalized)
    return sents_normalized
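
# text2sents() relies on a get_wordnet_pos() helper that is not shown in this listing.
# A minimal sketch of it, assuming the usual Penn-Treebank-to-WordNet mapping:
from nltk.corpus import wordnet


def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag to a WordNet POS constant, defaulting to noun."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

# Example call (requires the punkt, tagger and wordnet nltk data):
# text2sents("the cats were running quickly. dogs barked.", lemmatize=True)
# -> approximately [['the', 'cat', 'be', 'run', 'quickly'], ['dog', 'bark']]
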
def lemmatizing(line_list):
    """
    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Iterates over all terms in lines, lemmatize them using WordNetLemmatizer()

    Return: lemmatized_list (list of strings(terms that stemmed))
    """
    lemmatized_list = []
    lemmatizer = WordNetLemmatizer()
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # remove punctuation
        # the commented-out approach below simply strips punctuation characters,
        # which merges tokens such as amazon.com => amazoncom
        # nopunct_line = ''.join([c for c in line 
                                            # if re.match("[a-z\-\' \n\t]", c)])
        # replacing punctuation with spaces avoids that problem:
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # lemmatization
        lemmatized_line = []
        for term in line_token:
            term = lemmatizer.lemmatize(term)
            lemmatized_line.append(term)
        # back to sentence as a string
        lemmatized_sentence = ' '.join(lemmatized_line)
        lemmatized_list.append(lemmatized_sentence)
    return lemmatized_list
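
# A hedged usage sketch for lemmatizing(); wt is assumed to be nltk.tokenize.word_tokenize
# imported under that alias elsewhere in the module.
if __name__ == "__main__":
    docs = ["The Dogs were barking!", "Amazon.com sells books."]
    print(lemmatizing(docs))
    # approximate output: ['the dog were barking', 'amazon com sell book']
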
Example 5
def preprocess(original_str):
	# lemmatizer
	wnl = WordNetLemmatizer()
	# POS tagging
	original_str = unicode(original_str, errors='ignore')
	print type(original_str)
	article_tok = pos_tag(word_tokenize(original_str))
	print type(article_tok)
	print "token: "
	print article_tok

	# keep nouns and adjectives
	str_noun = ''
	for word, tag in article_tok:
		if ("NN" in tag) or ("JJ" in tag):
			# print(word,":",tag)
			# print(wnl.lemmatize(word))
			try:
				stemming_word = wnl.lemmatize(word)
				print stemming_word
				if len(word) > 1:
					str_noun = str_noun + stemming_word + " "
			except UnicodeDecodeError as e:
				print "error: " + word
			# end if



	# result
	# final_doc.append(str_noun)
	# print "return_preprocess : " + str_noun

	return str_noun
def lemmstem(sentences):
    ''' This function is responsible for performing
        the lemmatization and stemming of the words
        Input: a list of trees containing the sentences.
                All words are classified by their NE type
        Output: lemmatized/stemmed sentences
    '''
    
    lmtzr = WordNetLemmatizer()
    st = LancasterStemmer()
    
    dic = {'VB' :wordnet.VERB,
            'NN': wordnet.NOUN,
            'JJ':wordnet.ADJ,
            'RB':wordnet.ADV }
    
    for sent in sentences:
      
        lvsidx=sent.treepositions('leaves') 
       
        for pos in lvsidx:
            word=sent[pos][0]
            tag = sent[pos][1]
            rtag = tag[0:2]
            if rtag in dic:
                lemm=lmtzr.lemmatize( word, dic[rtag] )
                stem=st.stem(lemm)
                #print word, lemm, stem  # cursed debug line
                sent[pos]=(word, tag, stem)
            else:
                sent[pos]=(word, tag, word)
    
    return sentences
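
# lemmstem() expects NE-chunked sentence trees whose leaves are (word, tag) pairs. A plausible
# way to build that input with plain nltk (an assumption; the pipeline that produces the trees
# is not shown in this listing):
from nltk import word_tokenize, pos_tag, ne_chunk, sent_tokenize

text = "Barack Obama visited Paris. The children were playing happily."
trees = [ne_chunk(pos_tag(word_tokenize(s))) for s in sent_tokenize(text)]
# lemmstem(trees) then rewrites every (word, tag) leaf in place as (word, tag, stem_of_lemma)
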
def write_clean_turian_unigrams():
    """
    Extracts unigram embeddings from Socher's binary distribution. These can be used by other composers.

    There are only 50k embeddings (presumably for the most frequent tokens in the corpus). The words have not
    been processed: there are punctuation-only tokens, uppercased words and non-lemmatized words. There isn't
    any PoS tag filtering either, so function words like "to", "while" and "there" are included.

    I remove punctuation, then lowercase and lemmatize each entry. Multiple entries may map to the
    same canonical form. I select the shortest original entry (ties are broken by giving preference to
    words that are already lowercased). This could have been done better.
    Only vectors for the selected entries are kept. There are about 33k canonical
    forms left, many of which are not nouns/adjs/verbs.

    We don't have a PoS tag for the canonical forms. I get around the problem by creating 3 copies of each
    canonical form and expand "cat" to cat/N, cat/J and cat/V, which all share the same vector.
    """
    logging.info('Writing Turian unigrams to %s', turian_unigram_vectors_file)
    mat = loadmat(socher_unigram_embedding_matlab)
    words = [w[0] for w in mat['words'].ravel()]
    df = pd.DataFrame(mat['We'].T, index=words)

    lmtzr = WordNetLemmatizer()
    clean_to_dirty = defaultdict(list)  # canonical -> [non-canonical]
    dirty_to_clean = dict()  # non-canonical -> canonical
    to_keep = set()  # which non-canonical forms we will keep
    #  todo this can be done based on frequency or something

    for w in words:
        if set(w).intersection(set(string.punctuation).union(set('0123456789'))):
            # not a real word- contains digits or punctuation
            continue

        lemma = lmtzr.lemmatize(w.lower())
        clean_to_dirty[lemma].append(w)
        dirty_to_clean[w] = lemma

    # decide which of possibly many non-canonical forms with the same lemma to keep
    # prefer shorter and lowercased non-canonical forms
    for lemma, dirty_list in clean_to_dirty.items():
        if len(dirty_list) > 1:
            best_lemma = min(dirty_list, key=lambda w: (len(w), not w.islower()))
        else:
            best_lemma = dirty_list[0]
        to_keep.add(best_lemma)

    # remove non-canonical forms we don't want
    idx_to_drop = [i for i, w in enumerate(df.index) if w not in to_keep]
    ddf = df.drop(df.index[idx_to_drop])
    # canonicalize whatever is left
    ddf.index = [lmtzr.lemmatize(w.lower()) for w in ddf.index]

    # we don't know what the PoS tags of the canonical forms are, so make them all of the same tag
    # e.g. expand "cat" to cat/N, cat/J and cat/V, which all share the same vector
    new_index = ['%s/%s'%(w, pos) for pos in 'NJV' for w in ddf.index]
    new_data = np.vstack([ddf.values] * 3)
    ddf = pd.DataFrame(new_data, index= new_index)
    dv = DenseVectors(ddf, allow_lexical_overlap=True)
    dv.to_tsv(turian_unigram_vectors_file)
    logging.info('Done')
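
# A small self-contained demo of the tie-breaking rule described in the docstring above:
# among surface forms that share a lemma, keep the shortest, preferring an already-lowercase
# form on ties.
candidates = ['Dogs', 'Dog', 'dog']
print(min(candidates, key=lambda w: (len(w), not w.islower())))  # -> 'dog'
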
Example 8
    def init_feature_sentences(self, total_content):
        t = Tokenizer()
        p = POSTagger()
        wnl = WordNetLemmatizer()

        sentences = t.sent_tokenize(total_content.lower())

        for sentence in sentences:
            tagged_sentence = p.nltk_tag(t.word_tokenize(sentence))

            #Initializing Feature Sentence dictionary
            feature_sentence = {}
            feature_sentence['sentence'] = sentence
            feature_sentence['tags'] = tagged_sentence
            feature_sentence['nouns'] = []
            feature_sentence['noun_phrases'] = []

            #Finding the Nouns/Noun Phrases in the tagged sentence
            for i in range(0,len(tagged_sentence)):
                (word, tag) = tagged_sentence[i]

                #Chunking
                if tag.startswith('N') and tag != 'NNP':
                    if i > 0 and len(feature_sentence['nouns']) > 0 and tagged_sentence[i - 1][0] == feature_sentence['nouns'][-1] and feature_sentence['sentence'].find(feature_sentence['nouns'][-1] + ' ' + word) > -1:
                        feature_sentence['noun_phrases'].append(wnl.lemmatize(feature_sentence['nouns'].pop() + ' ' + word))
                    else:
                        feature_sentence['nouns'].append(wnl.lemmatize(word))

            self.feature_sentences.append(feature_sentence)
Example 9
def feature_extractor_tripadvisor_top_words_weights(data):
    data = data.decode('utf-8')

    top_file = open('scraper/top_words.txt', 'r')
    top_words = [word.replace('\n', '') for word in top_file]
    places_file = open('scraper/places.txt', 'r')

    for place in places_file:
        place = place.replace('\n', '')
        for word in place.split(' '):
            if word != '-':
                top_words.append(word)

    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    for word in words:
        if word not in stop_words:
            if word in features:
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1

    return features
Example 10
def feature_extractor_top_words_weights(data):
    data = data.decode('utf-8')
    top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel',
                 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort', 
                 'good', 'cebu', 'island']
    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    for word in words:
        if word not in stop_words:
            if word in features:
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1

    return features
Example 11
def feature_extractor_top_words_weights(data):
    """
     Extract features using the top words with weights method
     parameter: data (tweet)
     returns: returns features of the given data
    """
    data = data.decode('utf-8')
    # top 15 frequently-ocurring words from the tourism-related twitter corpus
    top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel',
                 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort', 
                 'good', 'cebu', 'island']
    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    # preprocessing: tokenize, convert to lowercase and lemmatize words
    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    # remove stop words and add words and their frequencies as features
    for word in words:
        if word not in stop_words:
            if word in features:
                # if word is found in the top words list, increase by 1.5 or preferred weight
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1

    return features
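
# A hedged, Python 2 style usage sketch for the weighted feature extractor above; exact
# counts depend on the nltk tokenizer and stopword list, so the output shown is approximate.
if __name__ == "__main__":
    tweet = "Enjoying the beach in Boracay, great view!"
    print(feature_extractor_top_words_weights(tweet))
    # approximate output: {'enjoying': 1, 'beach': 1.5, 'boracay': 1.5,
    #                      'great': 1, 'view': 1.5, ',': 1, '!': 1}
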
Example 12
def Check(mArray):
  
  # what am I checking?
  item = mArray[1]
  lmtzr = WordNetLemmatizer()
  item = lmtzr.lemmatize(item)
  
  # converts to a string
  return ''.join(item)
Example 13
def word_extractor2(text):
	wordlemmatizer = WordNetLemmatizer()
	text = re.sub(r'([a-z])\1+', r'\1\1', text)  # collapse runs of a repeated letter to two
	words = ""
	wordtokens = [ wordlemmatizer.lemmatize(word.lower()) \
	for word in word_tokenize(text.decode('utf-8', 'ignore')) ]
	for word in wordtokens:
		words+=" "+word
	return words
Example 14
def Check(mArray):

  #what am I checking?
  #Taking the 2nd item in the array since Popen puts the file path as the first item.
  item = mArray[1]
  lmtzr = WordNetLemmatizer()
  item = lmtzr.lemmatize(item, get_wordnet_pos(item))
    
  #converts to a string
  return ''.join(item)
def lemmatize(tokens): 
	# lemmatize words. try both noun and verb lemmatizations 
	lmtzr = WordNetLemmatizer() 
	for i in range(0,len(tokens)): 
		res = lmtzr.lemmatize(tokens[i]) 
		if res == tokens[i]: 
			tokens[i] = lmtzr.lemmatize(tokens[i], 'v') 
		else: 
			tokens[i] = res 
	return tokens
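
# A small illustration of the noun-then-verb fallback above: tokens the noun lemmatizer
# leaves unchanged get a second pass as verbs.
if __name__ == "__main__":
    print(lemmatize(["dogs", "running", "was"]))
    # expected output (approximately): ['dog', 'run', 'be']
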
def add_lemmatizer():
    in_fp = open(word_topic_file)
    out_fp = open(word_topic_lexeme_file,  'w')
    wnl = WordNetLemmatizer()
    ###
    line = ''
    line_num = 0
    while line_num < max_line_num:
        line = in_fp.readline()
        line = line.strip()
        line_words = line.split(' ')
        line_write = ''
        for words in line_words:
            word_topic = words.split(':')
            word_id = word_topic[0]
            topic_id = word_topic[1]
            line_write += word_id
            line_write += ':'
            line_write += topic_id
            line_write += ':'
            ##
            if id_word_dict.has_key(word_id):
                word = id_word_dict[word_id]
                if word_lexeme_id_dict.has_key(word):
                    line_write += word_lexeme_id_dict[word]
                    line_write += ' '
                else:
                    word_list = []
                    word_list.append(word)
                    pos = pt(word_list)
                    tag = pos[0][1]
                    lexeme = wnl.lemmatize(word,  penn_to_wn(tag))
                    #print ': ', word,  lexeme
                    if word_id_dict.has_key(lexeme):
                        lexeme_id = word_id_dict[lexeme]
                        word_lexeme_id_dict[word] = lexeme_id
                        line_write += lexeme_id
                        line_write += ' '
                    else:
                        word_lexeme_id_dict[word] = word_id
                        line_write += word_id
                        line_write += ' '
                
            ##
        line_write = line_write.strip()
        out_fp.write(line_write)
        if line_num < max_line_num -1:
            out_fp.write('\n')
        line_num += 1
        if line_num%1000 ==0:
            print 'line: ', line_num
    ###
    in_fp.close()
    out_fp.close()
class Lemmatizer():
	def __init__(self):
		self.lemmatizer = WordNetLemmatizer()
		self.stemmer = SnowballStemmer("english", ignore_stopwords=True)

	'''
	Tokenizes a sentence, lemmatizes every token, and joins the lemmas back into a string.
		sentence: str
	'''
	def lemmatize(self, sentence):
		tokens = word_tokenize(sentence)
		lemmas = self.lemmatizeTokens(tokens)
		return " ".join(lemmas)
		
	'''
	Turns phrase tokens into lemmatized tokens, which means into some standard format
	as determined by the nltk lemmatizer. "Dogs" to "dog", "went" to "go", etc.	 
		tokens: list of str
	'''
	def lemmatizeTokens(self, tokens):
		tokens_tagged = pos_tag(tokens)
		#Get simple POS tags.
		tokens_simpleTags = [(word, map_tag('en-ptb', 'universal', tag)) 
			for word, tag in tokens_tagged]
		
		#Actually lemmatize.
		lemmas = []
		for token, tag in tokens_simpleTags:
			lemmatized = ""
			if tag == "VERB":
				lemmatized = self.lemmatizer.lemmatize(token, pos='v')
			elif tag == "ADJ":
				lemmatized = self.lemmatizer.lemmatize(token, pos='a')
			elif tag == "ADV":
				lemmatized = self.lemmatizer.lemmatize(token, pos='r')
			else:
				lemmatized = self.lemmatizer.lemmatize(token) #pos = 'n'
			lemmas.append(lemmatized.encode("utf-8"))
		return lemmas

	'''
	Reduce this word down to its most basic form by removing suffixes or common ending
	and finding the "root" or "stem" of the word.

	Example: "response," "responsive," and "responsivity" all stem from "respons," or 
	something similar.
	'''
	def stem(self, tokens):
		stemmed = []
		for token in tokens:
			stem = self.stemmer.stem(token)
			stemmed.append(stem.encode("utf-8"))
		return stemmed
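
# A hedged, Python 2 style usage sketch for the Lemmatizer wrapper above (lemmatizeTokens
# encodes lemmas to utf-8, which yields plain strings under Python 2). Assumes the wordnet,
# punkt, tagger and universal_tagset nltk data are installed.
if __name__ == "__main__":
	lem = Lemmatizer()
	print(lem.lemmatize("The dogs were running home"))
	# approximate output: 'The dog be run home'
	print(lem.stem(["responsive", "responsivity"]))
	# approximate output: ['respons', 'respons']
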
Example 18
def review_to_words(raw_review, need_to_lemmatize=False):
    # Function to convert a raw review to a string of words
    # optional lemmatization
    #
    meaningful_words = review_to_wordlist(raw_review)

    if need_to_lemmatize:
        wnl = WordNetLemmatizer()
        meaningful_words = [wnl.lemmatize(w) for w in meaningful_words]

    # 6. Join the words back into one string separated by space
    return " ".join(meaningful_words)
Example 19
def feature_extractor(data):
    data = data.decode('utf-8')
    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    for word in words:
        if word not in stop_words:
            if word in features:
                features[word] += 1
            else:
                features[word] = 1

    return features
Example 20
	def __init__(self, lightweight=False):
		# self.sscol = WNGlossTag.read_all_glosstag(os.path.join(WORDNET_30_GLOSSTAG_PATH, 'merged'), verbose=True)
		if not lightweight:
			self.sscol = WNGlossTag.build_lelesk_data(os.path.join(WORDNET_30_GLOSSTAG_PATH, 'merged'), verbose=False)
		self.wnsql = WordNetSQL.get_default()
		self.wnl = WordNetLemmatizer()
		self.lemmatize_cache = dict()
Example 21
 def __init__(self):
     self.weights = [
     2.17985806e-01,
     6.01901694e-02,
     4.28099419e-01,
     0.14174161e-01,
     2.45876460e-01,
     2.19263225e-01,
     1.00816031e-01,
     1.06477027e-01,
     1.60378048e-03,
     5.79940520e-03,
     1.89163517e-02,
     1.68341118e-02,
     1.18885069e-01,
     2.68984406e-02,
     9.30754965e-03,
     1.78371552e-03,
     1.77288605e-03,
     2.37539365e-03,
     5.50162160e-05,
     1.10308137e-04,
     5.51531014e-05,
     5.35273441e-05,
     2.31964872e-01,
     1.68415302e-04,
     2.24946972e-01,
     ]
     self.lemmatizer = WordNetLemmatizer()
Example 22
 def __init__(self, lemmatize=True):
     self.debug = False
     self.stemmer = PorterStemmer()
     self.lemmatizer = WordNetLemmatizer()
     self.estLemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_ESTONIAN)
     self.lemmatize = lemmatize
     self.stopwords = self.get_stopwords()
 def __init__(self, stopwords=None, punct=None,
              lower=True, strip=True):
     self.lower = lower
     self.strip = strip
     #self.stopwords  = stopwords or set(sw.words('english'))
     self.punct = punct or set(string.punctuation)
     self.lemmatizer = WordNetLemmatizer()
Example 24
def get_words(document):
    '''
    Return a list of unique words in document
    '''
    regex1 = re.compile('\W')          # match non-alphanumeric
    regex2 = re.compile('&(#)*(\w)*;')  # match html entities
    regex3 = re.compile('( ){2,}')      # match more than 2 spaces
    lemmatizer = WordNetLemmatizer()
    tokenizer  = WhitespaceTokenizer()
    # lowercase document, remove punctuation, and html entities
    document   = regex3.sub(' ', regex2.sub(' ', regex1.sub(' ', document.lower())))
    words = [
             lemmatizer.lemmatize(word)
             for word in tokenizer.tokenize(document)
             if word not in STOPWORDS and len(word) > 2
            ]
    return FreqDist(words)
Example 25
def feature_extractor_tripadvisor_top_words_weights(data):
    """
     Extract features using the top words with weights 
     method using words from TripAdvisor
     parameter: data (tweet)
     returns: returns features of the given data
    """
    data = data.decode('utf-8')

    # retrieve file of top 100 frequently-occurring words from TripAdvisor comments
    top_file = open('classifier/top_words.txt', 'r')
    top_words = [word.replace('\n', '') for word in top_file]
    # retrieve file of 100 places from TripAdvisor
    places_file = open('classifier/places.txt', 'r')

    # clean places file
    for place in places_file:
        place = place.replace('\n', '')
        for word in place.split(' '):
            if word != '-':
                top_words.append(word)

    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    # preprocessing: tokenize, convert to lowercase and lemmatize words
    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    # remove stop words and add words and their frequencies as features
    # if word is found in the top words list, increase by 1.5 or preferred weight
    for word in words:
        if word not in stop_words:
            if word in features:
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1

    return features
Example 26
    def __word_cleaner(self, sentence):
        """ Removes the unwanted words in the sentence. """
        features = {}
        words = {}
        lematizer = WordNetLemmatizer()

        # get individual words from text
        words = [lematizer.lemmatize(word.lower()) for word in \
                 word_tokenize(sentence)]
        final_words = []

        for word in words:
            word = word.encode('utf-8', 'ignore')
            if len(word) > 1:
                # check if word in not a stop word
                if word not in stopwords.stop_words:
                    final_words.append(word)
        return ' '.join(final_words)
Example 27
 def _lemmatize_words(text):
     """Lemmatize all words in the text."""
     lemmatizer = WordNetLemmatizer()
     lemmatizations = {}
     tokens = text.split()
     for word in tokens:
         if word not in lemmatizations:
             lemmatizations[word] = lemmatizer.lemmatize(word)
     for i in xrange(5):  # Need to repeat several times to be safe
         tokens = text.split()
         for j in xrange(len(tokens)):
             try:
                 tokens[j] = lemmatizations[tokens[j]]
             except KeyError:
                 # During last pass, words were turned into their lemmas, which don't
                 # have entries in lemmatizations
                 pass
     text = ' '.join(tokens)
     return text
Example 28
def word_extractor2(text, sw):
	wordlemmatizer = WordNetLemmatizer()
	#Get the English stopwords
	commonwords = stopwords.words('english')
	text = re.sub(r'([a-z])\1+', r'\1\1', text)
	words = ""
	#Lower-case and lemmatize each token
	wordtokens = [wordlemmatizer.lemmatize(word.lower()) \
		     for word in word_tokenize(text.decode('utf-8', 'ignore'))]
	
	#Remove tokens that belong to the stopword set when sw == True
	if sw == True:
		for word in wordtokens:
			if word not in commonwords:
				words += " " + word
	else:
		for word in wordtokens:
			words += " " + word	

	return words
Example 29
 def __init__(self):
     self.weights_sentences = np.array([
         3.48961282e-01,
         3.75654800e-01,
         4.12711607e-01,
         -7.24616082e-01,
         3.77362029e-02,
         1.15394180e-02,
         1.33443409e-02,
         1.64232249e-02,
         -3.36975735e-02,
         -5.02300279e-03,
         -3.17276960e-02,
         -2.94709012e-02,
         1.09211720e-03,
         -1.68436954e-02,
         7.09680460e-03,
         1.01815575e-03,
         -2.07404857e-02,
         -3.86330862e-02,
         1.66864534e-06,
         9.97633950e-04,
         7.88702336e-04,
         -1.04303582e-02,
         6.93624232e-02,
         7.89814727e-03
     ])
     self.weights_phrases = np.array([
         0.36460685,
         0.16974013,
         0.32817442,
         0.21123618,
         0.44617679,
         0.45049947,
         0.18118603,
         0.16519158,
         0.00473076,
         0.00340283,
         0.11341166,
         0.04393267,
         0.25306257,
         0.01741644,
         0.0228946,
         0.0,
         0.00326796,
         0.00490194,
         0.0,
         0.0,
         0.0,
         0.00160063,
         0.37955125,
         0.0
     ])
     self.lemmatizer = WordNetLemmatizer()
Example 30
def feature_extractor(d):
    features = {}
    words = {}
    lematizer = WordNetLemmatizer()

    # get individual words from text
    words = [lematizer.lemmatize(word.lower()) for word in word_tokenize(d)]

    for word in words:
        word = word.encode('utf-8', 'ignore')
        if len(word) > 1:
            # check if word in not a stop word
            if word not in stopwords.stop_words:
                # check if the word is not a url or @person
                if not re.match('http://.*|@.*', word):
                    if word in features:
                        features[word] += 1
                    else:
                        features[word] = 1
    return features
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """
    Transforms input data by using NLTK tokenization, POS tagging, lemmatization and vectorization.
    """

    def __init__(self, corpus, max_sentence_len = 300, stopwords=None, punct=None, lower=True, strip=True):
        """
        Instantiates the preprocessor.
        """
        self.lower = lower
        self.strip = strip
        self.stopwords = set(stopwords) if stopwords else set(sw.words('english'))
        self.punct = set(punct) if punct else set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()
        self.corpus = corpus
        self.max_sentence_len = max_sentence_len

    def fit(self, X, y=None):
        """
        Fit simply returns self.
        """
        return self

    def inverse_transform(self, X):
        """
        No inverse transformation.
        """
        return X

    def transform(self, X):
        """
        Actually runs the preprocessing on each document.
        """
        output = np.array([(self.tokenize(doc)) for doc in X])
        return output

    def tokenize(self, document):
        """
        Returns a normalized, lemmatized list of tokens from a document by
        applying segmentation, tokenization, and part of speech tagging.
        Uses the part of speech tags to look up the lemma in WordNet, and returns the lowercase
        version of all the words, removing stopwords and punctuation.
        """
        lemmatized_tokens = []

        # Clean the text
        document = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", document)
        document = re.sub(r"what's", "what is ", document)
        document = re.sub(r"\'s", " ", document)
        document = re.sub(r"\'ve", " have ", document)
        document = re.sub(r"can't", "cannot ", document)
        document = re.sub(r"n't", " not ", document)
        document = re.sub(r"i'm", "i am ", document)
        document = re.sub(r"\'re", " are ", document)
        document = re.sub(r"\'d", " would ", document)
        document = re.sub(r"\'ll", " will ", document)
        document = re.sub(r"(\d+)(k)", r"\g<1>000", document)

        # Break the document into sentences
        for sent in sent_tokenize(document):

            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):

                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If punctuation or stopword, ignore token and continue
                if token in self.stopwords or all(char in self.punct for char in token):
                    continue

                # Lemmatize the token
                lemma = self.lemmatize(token, tag)
                lemmatized_tokens.append(lemma)

        doc = ' '.join(lemmatized_tokens)
        tokenized_document = self.vectorize(np.array(doc)[np.newaxis])
        return tokenized_document


    def vectorize(self, doc):
        """
        Returns a vectorized padded version of sequences.
        """
        save_path = "Data/padding.pickle"
        with open(save_path, 'rb') as f:
            tokenizer = pickle.load(f)
        doc_pad = tokenizer.texts_to_sequences(doc)
        doc_pad = pad_sequences(doc_pad, padding='pre', truncating='pre', maxlen=self.max_sentence_len)
        return np.squeeze(doc_pad)

    def lemmatize(self, token, tag):
        """
        Converts the Penn Treebank tag to a WordNet POS tag, then uses that
        tag to perform WordNet lemmatization.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)
Example 32
def lemmatize(word, pos):
    global lemmer
    if lemmer is None:
        lemmer = WordNetLemmatizer()

    return lemmer.lemmatize(word, get_wordnet_pos(pos))
Example 33
 def __init__(self):
     self.wnl = WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

url = 'https://www.gutenberg.org/cache/epub/16370/pg16370.txt'
file = urllib.request.urlopen(url)
text = ''
for line in file:
    decoded_line = line.decode("utf-8")
    text = text + decoded_line
text = text.replace('\r\n', ' ')
for x in range(10):
    text = text.replace('  ', ' ')

#%% Pos tagging and lemmatization

wnl = WordNetLemmatizer()
sentences = sent_tokenize(text)
current_sentence = sentences[150]
tokens = word_tokenize(current_sentence)
tagged = pos_tag(tokens)

print(current_sentence)
for x in tagged:
    word = x[0]
    tag = x[1]
    if x[1].startswith('V'):
        result = wnl.lemmatize(word, pos='v')
        print(word, tag, result)

#%% Get all the adjectives in the text
all_adjectives = []
Example 35
# In[27]:

from nltk import WordNetLemmatizer, PorterStemmer, LancasterStemmer

# In[28]:

# Generate random embedding with same scale as glove
np.random.seed(SEED)
shape = (VOCAB_SIZE, EMBEDDING_SIZE)
scale = glove_embedding_weights.std() * np.sqrt(12) / 2
embedding = np.random.uniform(low=-scale, high=scale, size=shape)

# In[29]:

wnl = WordNetLemmatizer()
porter = PorterStemmer()
lancaster = LancasterStemmer()

# In[30]:

# Copy from glove weights of words that appear in index2word
count = 0
for i in range(1, VOCAB_SIZE):
    w = index2word[i]
    g = glove_index_dict.get(w)
    if g is None:
        w = wnl.lemmatize(w)
        g = glove_index_dict.get(w)
    if g is None:
        w = porter.stem(w)
Example 36
from nltk.corpus import brown, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk import WordNetLemmatizer

# Name: Jesse Huss
# ID: 001209444
# Project: Assignment 1

stopWords = set(stopwords.words('english'))
wln = WordNetLemmatizer()
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
lancaster = LancasterStemmer()

for cat in brown.categories():
    words = brown.words(categories=cat)
    noStopWords = [nsw for nsw in words if nsw not in stopWords]
    lemmatizedWords = [wln.lemmatize(lw) for lw in words]
    pstemmedWords = [porter.stem(psw) for psw in words]
    lstemmedWords = [lancaster.stem(lsw) for lsw in words]
    print(cat.upper() + ':')
    print('Word Tokens:\n' + str(len(words)) + ' vanilla.\n' +
          str(len(noStopWords)) + ' no stop words.\n' +
          str(len(lemmatizedWords)) + ' lemmatized.\n' +
          str(len(pstemmedWords)) + ' porter stemmed.\n' +
          str(len(lstemmedWords)) + ' lancaster stemmed.\n')
    print('Word Types:\n' + str(len(set(words))) + ' vanilla.\n' +
          str(len(set(noStopWords))) + ' no stop words.\n' +
          str(len(set(lemmatizedWords))) + ' lemmatized.\n' +
          str(len(set(pstemmedWords))) + ' porter stemmed.\n' +
Example 37
# nltk.download() # To make sure all ntlk site packages are upto date and installed to get started with nltk
from nltk import PorterStemmer
from nltk import WordNetLemmatizer

paragraph = """Thank you all so very much. Thank you to the Academy. Thank you to all of you in this room. I have to congratulate the other incredible nominees this year. The Revenant was the product of the tireless efforts of an unbelievable cast and crew. First off, to my brother in this endeavor, Mr. Tom Hardy. Tom, your talent on screen can only be surpassed by your friendship off screen … thank you for creating a transcendent cinematic experience. Thank you to everybody at Fox and New Regency … my entire team. I have to thank everyone from the very onset of my career … To my parents; none of this would be possible without you. And to my friends, I love you dearly; you know who you are. And lastly, I just want to say this: Making The Revenant was about man's relationship to the natural world. A world that we collectively felt in 2015 as the hottest year in recorded history. Our production needed to move to the southern tip of this planet just to be able to find snow. Climate change is real, it is happening right now. It is the most urgent threat facing our entire species, and we need to work collectively together and stop procrastinating. We need to support leaders around the world who do not speak for the big polluters, but who speak for all of humanity, for the indigenous people of the world, for the billions and billions of underprivileged people out there who would be most affected by this. For our children’s children, and for those people out there whose voices have been drowned out by the politics of greed. I thank you all for this amazing award tonight. Let us not take this planet for granted. I do not take tonight for granted. Thank you so very much."""

## Tokenizing sentences
sentences = nltk.sent_tokenize(paragraph)
# print(sentences)

## Tokenizing words
# wordz = nltk.word_tokenize(paragraph)
# print(wordz)

# stemmer = PorterStemmer() # Creating an object of PorterStemmer class
lemmatizer = WordNetLemmatizer()  # Creating an object of WordNetLemmatizer class

## Stemming
# for i in range(len(sentences)):
#     words = nltk.word_tokenize(sentences[i]) # Word Tokenization on sentences list.
#     stemmed_words = [stemmer.stem(word) for word in words] #List Comprehension usage and stemming each word of a single sentence at a time.
#     sentences[i] = ' '.join(stemmed_words) # Joining all stemmed words back into sentences using space delimiter and join function

# print(sentences)

## Lemmatization
for j in range(len(sentences)):
    words = nltk.word_tokenize(
        sentences[j])  # Word Tokenization on sentences list.
    lemmatized_words = [
        lemmatizer.lemmatize(word) for word in words
    ]  # List Comprehension usage, lemmatizing each word of a single sentence at a time.
    sentences[j] = ' '.join(lemmatized_words)  # Joining lemmatized words back into sentences

list_of_all_pos_tags=['ADJ' , 'ADP' , 'ADV' , 'AUX' , 'CCONJ' , 'DET' , 'INTJ' , 'NOUN' , 'NUM' , 'PART' , 'PRON' , 'PROPN' , 'PUNCT', 'SCONJ' , 'SYM' , 'VERB','X']

ref={}
for i in list_of_all_pos_tags:
	ref[i]=None
ref['AUX']='v'
ref['ADJ']='a'
ref['NOUN']='n'
ref['VERB']='v'
ref['ADV']='r'


import re
import nltk
from nltk import WordNetLemmatizer
wn_lemmatizer = WordNetLemmatizer()
# This function returns whether the i-th character of string s is a consonant or not
def cons(s,i):
    if re.match('[aeiou]',s[i]):
        return False
    if re.match('y',s[i]):
        if i==0:
            return True
        else:
            return (not cons(s,i-1))
    return True

#This function returns the measure m of a word or word part, [C](VC)^m[V]
def m(s):
    m = 0
    for i in range(0, len(s) - 1):
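
# The listing cuts m() off above; a hedged sketch of the measure under the standard Porter
# definition [C](VC)^m[V], where m counts vowel-run to consonant-run transitions:
def m_sketch(s):
    count = 0
    for i in range(0, len(s) - 1):
        # a vowel followed by a consonant closes one VC group
        if (not cons(s, i)) and cons(s, i + 1):
            count += 1
    return count

# Porter's own examples: m_sketch("tree") == 0, m_sketch("trouble") == 1, m_sketch("oaten") == 2
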
Example 39
 def build_analyzer(self):
     lemm = WordNetLemmatizer()
     analyzer = super(LemmaCountVectorizer, self).build_analyzer()
     return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc)
                         if (not w.isdigit()) and len(w) >= 3)
class MatchWordFeatures(QaTextFeautrizer):
    def __init__(self,
                 require_unique_match,
                 lemmatizer="word_net",
                 empty_question_features=False,
                 stop_words=None):
        self.lemmatizer = lemmatizer
        self.stop_words = stop_words
        self.empty_question_features = empty_question_features
        if lemmatizer == "word_net":
            self._lemmatizer = WordNetLemmatizer()
        else:
            raise ValueError()
        self._cache = {}
        self.require_unique_match = require_unique_match

    def n_context_features(self):
        return 3

    def n_question_features(self):
        return 3 if self.empty_question_features else 0

    def lemmatize_word(self, word):
        cur = self._cache.get(word)
        if cur is None:
            cur = self._lemmatizer.lemmatize(word)
            self._cache[word] = cur
        return cur

    def get_features(self, question, context):
        stop = set() if self.stop_words is None else self.stop_words.words
        context_features = np.zeros((len(context), 3))

        if not self.require_unique_match:
            question_words = set(x for x in question if x.lower() not in stop)
            question_words_lower = set(x.lower() for x in question)
            question_words_stem = set(
                self.lemmatize_word(x) for x in question_words_lower)
        else:
            question_words = set(k for k, v in Counter(question).items()
                                 if v == 1)
            question_words_lower = set(k for k, v in Counter(
                x.lower() for x in question_words).items() if v == 1)
            question_words_stem = set(k for k, v in Counter(
                self.lemmatize_word(x) for x in question_words_lower).items()
                                      if v == 1)

        for i, word in enumerate(context):
            if word in question_words:
                context_features[i][:3] = 1
            elif word.lower() in question_words_lower:
                context_features[i][:2] = 1
            elif self._lemmatizer.lemmatize(word) in question_words_stem:
                context_features[i][2] = 1

        if self.empty_question_features:
            return np.zeros((len(question), 3)), context_features
        else:
            return np.zeros((len(question), 0)), context_features

    def __setstate__(self, state):
        self.__init__(**state)

    def __getstate__(self):
        state = dict(self.__dict__)
        del state["_cache"]
        del state["_lemmatizer"]
        return state
Example 41
def tokenizer(data: DataFrame, rows, columns):
    tokenDict = dict() #"<entry>": (tf(overall), df, [list of docs it appears in])
    tokenDocs = dict()
    tokPostings = dict() #"<entry>": {docid: [tf in that doc, max_tf, doclen], ...}
    docInfo = dict()
    lematizer = WordNetLemmatizer()
    stopWords = set(stopwords.words("english"))

    for i in range(0, rows):
        tf = 1
        max_tf = 1
        doclen = 0
        docNo = i
        tokens1 = word_tokenize(data["Title"][i])
        tokens = list()
        #print(data["Text"][i])
        sentenceList = sent_tokenize(data["Text"][i])
        for sentence in sentenceList:
            tmp = word_tokenize(sentence)
            for t in tmp:
                tokens.append(t)

        #tokens = word_tokenize(sent_tokenize(data["Text"]))

        for t in tokens1:
            tokens.append(t)

        for tok in tokens:
            doclen += 1
            if tok in stopWords:
                continue
            word = lematizer.lemmatize(tok)
            if word in tokenDict:
                tokenDict[word] = tokenDict.get(word) + 1
                tokenDocs[word].add(docNo)
                # tokPostings[word].
            else:
                tokenDict[word] = 1
                tokenDocs[word] = {docNo}
                # tokPostings[word] = {docNo:1}
            if word in tokPostings:
                if docNo in tokPostings[word].keys():
                    tokPostings[word][docNo][0] = tokPostings[word][docNo][0] + 1
                    tf = tokPostings[word][docNo][0]
                    if tf > max_tf:
                        max_tf = tf
                else:
                    tokPostings[word][docNo] = [1, 0, 0]
            else:
                tokPostings[word] = {docNo: [1, 0, 0]}  # {docid: (tf,max_tf, doclen)}

        docInfo[docNo] = [max_tf, doclen]
        for word in tokPostings.keys():
            for doc in tokPostings[word]:
                tokPostings[word][int(doc)][1] = docInfo[int(doc)][0]
                tokPostings[word][int(doc)][2] = docInfo[int(doc)][1]
    sumOfDoclens = 0
    for doc in docInfo:
        sumOfDoclens += docInfo[doc][1]
    avgDoclen = sumOfDoclens / rows
    fullTokenDict = combineDicts(tokenDict, tokenDocs)  # combine dictionaries with same key set


    if fullTokenDict == -1:
        print("Failed in combining dictionaries")
        return
    # else:
    #     print(fullTokenDict)
    # print(tokenDict)
    # stemmedTokenDict, stemmedTokenDocs = stemmer(tokenDict)
    return fullTokenDict, tokPostings, avgDoclen
Example 42
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from nltk import RegexpTokenizer, WordNetLemmatizer
from nltk.corpus import stopwords
from pymongo import MongoClient

client = MongoClient("localhost", 27017)
messages = []
for post in client.pets.posts.find():
    messages.append(post['message'])
print("\nReceived data from Mongo...")
print(messages)
tokenizer = RegexpTokenizer(r"\w+")
lemmatizer = WordNetLemmatizer()
print("Processing words...")
processed = []
for message in tqdm(messages):
    tokens = [
        t for t in tokenizer.tokenize(str.lower(message))
        if t not in stopwords.words("english")
    ]
    if len(tokens) > 0:
        processed.append([lemmatizer.lemmatize(t) for t in tokens])
print("Processed...")
print(processed)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([y for x in processed for y in x])
clusters = 9
model = KMeans(n_clusters=clusters)
model.fit(X)
Example 43
 def __init__(self):
     super().__init__()
     self.lemmer = WordNet()
Example 44
import os
Example 45
"""单词的形态"""
from nltk import PorterStemmer, LancasterStemmer, RegexpStemmer, SnowballStemmer, WordNetLemmatizer

"""(英语?)词干提取器 nltk.stem"""
stemmer = PorterStemmer()
stemmer = LancasterStemmer()
stemmer = RegexpStemmer('ing')
out = [stemmer.stem('working'), stemmer.stem('happiness'), stemmer.stem('pairing')]
print(out)
print(SnowballStemmer.languages)
stemmer = SnowballStemmer('spanish')
out = stemmer.stem('comiendo')
print(out)
stemmer = SnowballStemmer('french')
out = stemmer.stem('manager')
print(out)
"""词形还原"""
lemmatizer = WordNetLemmatizer()
out = [lemmatizer.lemmatize('working'), lemmatizer.lemmatize('working', pos='v'), lemmatizer.lemmatize('works')]
print(out)

"""非英语单词提取器 安装polyglot词典"""
# print(downloader.supported_languages_table('morph2'))
Example 46
import re
import pandas as pd
import pickle
import string

from utils import flatten_nested_list
from collections import Counter
from nltk import WordNetLemmatizer
from nltk import word_tokenize
from symspellpy import SymSpell, Verbosity

# Initialize WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()


class EntityPreprocessing:
    def __init__(self,
                 domain_dict,
                 n_thres=10,
                 ignore_article_counts=True,
                 n_spell_check_thres=5):
        """
        params:
        domain_dict: dictionary containing article pmcid (key) to predicted domain for article (value)
        n_thres: minimum number of instances across the corpus an entity must have to be included in the final entity list
        ignore_article_counts: ignore the number of instances of an entity within an article
        ent_categories: dictionary containing a categorization of entities into pre-specified categories
        n_spell_check_thres: any entity with a number of instances across the corpus below this number is 
            spell-checked
        """
        self.domain_classifier = domain_dict
Example 47
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """
    Transforms input data by using NLTK tokenization, lemmatization, and
    other normalization and filtering techniques.
    """
    def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
        """
        Instantiates the preprocessor, which make load corpora, models, or do
        other time-intenstive NLTK data loading.
        """
        self.lower = lower
        self.strip = strip
        self.stopwords = set(stopwords) if stopwords else set(
            sw.words('english'))
        self.punct = set(punct) if punct else set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """
        Fit simply returns self, no other information is needed.
        """
        return self

    def inverse_transform(self, X):
        """
        No inverse transformation
        """
        return X

    def transform(self, X):
        """
        Actually runs the preprocessing on each document.
        """
        return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        """
        Returns a normalized, lemmatized list of tokens from a document by
        applying segmentation (breaking into sentences), then word/punctuation
        tokenization, and finally part of speech tagging. It uses the part of
        speech tags to look up the lemma in WordNet, and returns the lowercase
        version of all the words, removing stopwords and punctuation.
        """
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If punctuation or stopword, ignore token and continue
                if token in self.stopwords or all(char in self.punct
                                                  for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        """
        Converts the Penn Treebank tag to a WordNet POS tag, then uses that
        tag to perform much more accurate WordNet lemmatization.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)
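
# A hedged sketch of running the transformer above on its own; in practice it is meant to sit
# inside an sklearn Pipeline before a vectorizer. Assumes sw, wn, string, sent_tokenize,
# wordpunct_tokenize and pos_tag are imported as in the original module, plus the usual
# nltk data (punkt, tagger, wordnet, stopwords).
if __name__ == "__main__":
    pre = NLTKPreprocessor()
    print(pre.transform(["The cats were chasing mice!"]))
    # approximate output: [['cat', 'chase', 'mouse']]
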
Example 48
def preprocess(sentence):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(unicode(sentence, errors='ignore'))]
Example 49
def generate_part2_dict(ibex_data, unique_id):
    """Given an ibex results file, returns a dictionary of the following format --
    mystery word: [target, highest rated guess, lowest rated guess]"""
    Lemmy = WordNetLemmatizer()
    with open(ibex_data, 'rb+') as ibex_data:
        ibex_data = csv.reader(
            filter(lambda data_row: data_row[0] != '#', ibex_data))
        ibex_data = list(ibex_data)

        subject_id = unique_id
        subject_age = ibex_data[1][8]
        subject_sex = ibex_data[2][8]

        ibex_data = filter(lambda row: row[5] != 'end', ibex_data)
        ibex_data = filter(lambda row: row[5] != 'intro3', ibex_data)
        ibex_data = [[x.lower() for x in y] for y in ibex_data]
        subj_dict = {}
        guess_and_confidence = []

        previous_line = ['', '', '', '', '', '', '', '', '']
        trial_identifier = 5
        mystery_word, target_word, guess, confidence = 0, 1, 2, 2

        for current_line in ibex_data:
            # print "Current line:" + str(current_line)

            if current_line[trial_identifier] == previous_line[
                    trial_identifier]:
                # print "match"
                current_line_info = current_line[trial_identifier].split("_")
                previous_line_info = previous_line[trial_identifier].split("_")

                current_line_info = [x.lower() for x in current_line_info]
                previous_line_info = [x.lower() for x in previous_line_info]

                if (current_line_info[target_word],
                        current_line_info[mystery_word]) not in subj_dict:
                    subj_dict[(current_line_info[target_word],
                               current_line_info[mystery_word])] = [
                                   (previous_line[8], current_line[8])
                               ]
                else:
                    if (current_line_info[target_word],
                            current_line_info[mystery_word]) in subj_dict:
                        subj_dict[(current_line_info[target_word],
                                   current_line_info[mystery_word])] += [
                                       (previous_line[8], current_line[8])
                                   ]

            previous_line = current_line

        if len(subj_dict) != 12:
            raise ValueError(
                "ERROR: subj_dict does not equal 12. Check input results file")

        part_2_dict = defaultdict(list)

        # initialize a new dictionary for tracking some stats about the subject responses
        response_stats = defaultdict(list)

        for target_w_mystery_w, g_c_list in subj_dict.iteritems():

            g_c_reversed = reversed(g_c_list)
            g_c_reversed = list(g_c_reversed)
            guesses = []

            correct_answer_alternate_form = False
            for gc in g_c_reversed:
                lemmatized_guess = Lemmy.lemmatize(
                    gc[0].strip().decode('unicode_escape').encode(
                        'ascii', 'ignore'),
                    pos='n')
                lemmatized_guess = lemmatized_guess.encode('utf-8')

                for k, v in correct_answers.iteritems():
                    if lemmatized_guess in v:
                        correct_answer_alternate_form = lemmatized_guess
                        lemmatized_guess = k
                guesses.append((lemmatized_guess, gc[1]))

            guesses = [(x[0], int(x[1])) for x in guesses]
            # find if the target word was guessed during learning
            # and, find the highest confidence for that guess
            # and, find the number of times it was guessed
            target_guessed = 0
            target_highest_confidence = 'NA'
            target_n_times_guessed = 'NA'
            if correct_answer_alternate_form:
                target_guessed = 1
                target_highest_confidence = max(x[1] for x in guesses
                                                if x[0] == lemmatized_guess)
                target_n_times_guessed = sum(x[0] == lemmatized_guess
                                             for x in guesses)
            elif target_w_mystery_w[0] in [x[0] for x in g_c_reversed]:
                target_guessed = 1
                target_highest_confidence = max(
                    x[1] for x in guesses if x[0] == target_w_mystery_w[0])
                target_n_times_guessed = sum(x[0] == target_w_mystery_w[0]
                                             for x in guesses)

            response_stats[target_w_mystery_w[0]] = [
                target_guessed, target_highest_confidence,
                target_n_times_guessed
            ]

            guesses = [gc for gc in guesses if gc[0] != target_w_mystery_w[0]]

            if not guesses:
                guesses = [
                    (random.choice(frequent_words), random.randint(1, 5)),
                    (random.choice(frequent_words), random.randint(1, 5)),
                    (random.choice(frequent_words), random.randint(1, 5))
                ]

            highest_confidence = max(x[1] for x in guesses)
            lowest_confidence = min(x[1] for x in guesses)

            highest_guesses = map(
                lambda x: x if x[1] >= highest_confidence else None, guesses)
            lowest_guesses = map(
                lambda x: x if x[1] <= lowest_confidence else None, guesses)

            highest_guesses = (x for x in highest_guesses if x is not None)
            lowest_guesses = (x for x in lowest_guesses if x is not None)

            highest_guess = next(highest_guesses, None)
            lowest_guess = next(lowest_guesses, None)

            highest_guess = highest_guess[0]
            lowest_guess = lowest_guess[0]

            if highest_guess == lowest_guess:
                # print "high-low match"
                lowest_guess = next(lowest_guesses, None)
                lowest_guess = lowest_guess[0] if type(
                    lowest_guess) is tuple else None

            highest_guessed = 0
            highest_guess_highest_confidence = 'NA'
            highest_guess_n_times_guessed = 'NA'
            lowest_guessed = 0
            lowest_guess_highest_confidence = 'NA'
            lowest_guess_n_times_guessed = 'NA'

            if highest_guess in [x[0] for x in guesses]:
                highest_guessed = 1
                highest_guess_highest_confidence = max(
                    x[1] for x in guesses if x[0] == highest_guess)
                highest_guess_n_times_guessed = sum(x[0] == highest_guess
                                                    for x in guesses)

            if lowest_guess in [x[0] for x in guesses]:
                lowest_guessed = 1
                lowest_guess_highest_confidence = max(x[1] for x in guesses
                                                      if x[0] == lowest_guess)
                lowest_guess_n_times_guessed = sum(x[0] == lowest_guess
                                                   for x in guesses)

            response_stats[highest_guess] = [
                highest_guessed, highest_guess_highest_confidence,
                highest_guess_n_times_guessed
            ]
            response_stats[lowest_guess] = [
                lowest_guessed, lowest_guess_highest_confidence,
                lowest_guess_n_times_guessed
            ]
            response_stats['distractor'] = [0, 'NA', 'NA']

            target_word = correct_answer_alternate_form if correct_answer_alternate_form else target_w_mystery_w[
                0]
            part_2_dict[target_w_mystery_w[1]] = [
                target_word, highest_guess, lowest_guess
            ]
        # print subject_id

        return [part_2_dict, response_stats]
class DynammicClustering:
    """""" """""" """
    Initializing NER model and files input
    """ """""" """"""
    gateway = JavaGateway()  # connect to the JVM

    def __init__(self,
                 a=1 / 11,
                 b=5 / 11,
                 c=1 / 11,
                 d=4 / 11,
                 threshold=0.16,
                 threshold2=0.3,
                 threshold3=0.4,
                 inputFile="../Data/mytweet02.csv",
                 Outfilename="../MyOutputs/clustersIds.csv"):
        """""" """""" """""" """
        similarity score parameters a,b,c,d
        """ """""" """""" """"""
        self.a = a  # commen Noun
        self.b = b  # properNoun
        self.c = c  # verb
        self.d = d  # hashtag threshold
        self.threshold = threshold
        self.threshold2 = threshold2
        self.threshold3 = threshold3
        self.inputFile = inputFile
        self.clusterfile = Outfilename
        """""
        Inputfile reading
        """
        self.Alltweets = pd.read_csv(inputFile, ",")
        data = []
        data.insert(
            0, {
                'id': 1234567890,
                'created_at': "Mon Apr 01 02:59:33 +0000 2019",
                'text':
                "@awesome_lucky Congrats to Mo Yan for being the 1st Chinese Nobel Prize of Literature laureate!",
                'user': "******",
                'retweet_count': 5
            })

        self.Alltweets = pd.concat([pd.DataFrame(data), self.Alltweets],
                                   ignore_index=True,
                                   sort=False)
        output = open(Outfilename, mode='wt', encoding='utf-8')
        fieldnames = ['clusterno', 'tweetd']
        self.writer = csv.DictWriter(output,
                                     fieldnames=fieldnames,
                                     quoting=csv.QUOTE_MINIMAL)
        self.writer.writeheader()

        output1 = open("../MyOutputs/slang2.csv", mode='wt', encoding='utf-8')
        fieldnames = ['id', 'slangs']
        self.slang_writer = csv.DictWriter(output1,
                                           fieldnames=fieldnames,
                                           quoting=csv.QUOTE_MINIMAL)
        self.slang_writer.writeheader()
        self.lmtz = WordNetLemmatizer()
        self.MergeCache = defaultdict(MergeCluster)
        self.UntiClusters = defaultdict(cluster)

    def mySimilarityFun(self, x, java_object, r):

        return x.similarity(java_object, r, self.a, self.b, self.c, self.d)

    def tweetPrecos(self, textWord):
        _str = re.sub('[^a-zA-Z0-9-_.]', '', textWord)
        # Check if selected word matches short forms[LHS] in text file.
        if _str.upper() in abbrRemov.keys():
            # If match found replace it with its appropriate phrase in text file.
            _str = abbrRemov[_str.upper()]
        return _str

    def tweets_to_clusters(self, inputfile, clustersFile):
        x = pd.read_csv(inputfile, ',')

        y = pd.read_csv(clustersFile, ',')
        z = pd.read_csv("../MyOutputs/slang2.csv",
                        sep=',',
                        quotechar='"',
                        converters={1: ast.literal_eval})

        tweets = {}
        merged = pd.merge(y, x, left_on='tweetd', right_on='id')
        merged = pd.merge(merged, z, left_on='id', right_on='id')
        col = ['tweets', 'clusterID']
        df = pd.DataFrame(columns=col, index=None)
        df['tweets'] = merged['text'].apply(lambda x: self.tweet_clean(
            x.lower(), merged.loc[merged['text'] == x, 'slangs'].iloc[0]))

        df['tweets'].replace('', numpy.nan, inplace=True)
        df.dropna(subset=['tweets'], inplace=True)
        df['clusterID'] = merged['clusterno']
        df.to_csv('../MyOutputs/clusters.csv')
        df['tweets'] = merged['text']
        df.to_csv('../MyOutputs/Cleaned.csv')

    #abbr removed
    def translator(self, user_string):
        # Check if selected word matches short forms[LHS] in text file.
        if user_string.upper() in abbrRemov.keys():
            # If match found replace it with its appropriate phrase in text file.
            user_string = abbrRemov[user_string.upper()]
        return user_string

    """""" """""" """"
      preproceessing variable and functions
      """ ""

    def tweet_clean(self, t, words):
        #cut out newline/carriage-return formatting
        t = t.replace('\n', " ").replace('\r', " ")
        #remove every token previously flagged as removable (e.g. emoji, slang)
        for ele in words:
            if len(ele) >= 2:
                t = t.replace(ele + " ", " ")
        #remove @mentions
        t = re.sub(r'@[^\s]+', '', t)
        #remove urls
        t = re.sub(
            r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
            " ", t)
        #remove punctuation and other non-word characters
        t = re.sub(r"[^\w\s]", "", t)

        if (t.isspace()):
            return None

        return t

    """""
    
    performs Part of the speech tagging
    
    input: text to parse and tweet id
    out: Default dict of tags
    """

    def NERPass(self, text, id):
        Preprocessed = defaultdict()
        java_object = DynammicClustering.gateway.entry_point.getStack(
            text.lower())  # return {A: "adjective list",N:"nouns list....}
        keysToMatch = {'#', 'V', 'T'}
        nounKeys = {'N', '^', 'Z', 'M', 'S'}
        removeables = {'!', '~', 'G', 'E', '#'}
        slangs = []
        if 'U' in java_object:
            #lemmatization improves matching during preprocessing
            Preprocessed['U'] = mset(
                self.lmtz.lemmatize(word, 'v')
                for word in re.split(" ", java_object['U']))
        #match { '#', 'V', 'T'}
        for key in keysToMatch:
            if (key in java_object):
                Preprocessed[key] = mset(
                    self.lmtz.lemmatize(word, 'v')
                    for word in re.split(" ", java_object[key]))
        sett = None
        #match Nouns (all kinds in here)
        for key in nounKeys:
            if (key in java_object):
                if sett is None:
                    sett = mset(
                        self.lmtz.lemmatize(word, 'v')
                        for word in re.split(" ", java_object[key]))
                else:
                    sett = sett | mset(
                        self.lmtz.lemmatize(word, 'v')
                        for word in re.split(" ", java_object[key]))

        if sett is not None:
            Preprocessed['N'] = sett
        sett = None
        #slang and emoticons in the tweet
        for key in removeables:
            if (key in java_object):
                if sett is None:
                    sett = set(
                        self.lmtz.lemmatize(word, 'v')
                        for word in re.split(" ", java_object[key]))
                else:
                    sett = sett | set(
                        self.lmtz.lemmatize(word, 'v')
                        for word in re.split(" ", java_object[key]))

        if sett is not None:
            self.slang_writer.writerow({'id': id, 'slangs': list(sett)})

        return Preprocessed

    def MergeClusters(self, unitCluster):
        score = []
        if len(self.MergeCache) == 0:
            cno1 = len(self.MergeCache)
            self.MergeCache[cno1] = MergeCluster(cno1)
            self.MergeCache[cno1].Extend(unitCluster)
            return
        score = list(
            map(
                lambda x: self.MergeCache[x].similarity(
                    unitCluster, self.a, self.b, self.c, self.d),
                self.MergeCache.keys()))
        score = sorted(score, key=self.takeSecond, reverse=True)
        # print(score)   # print("score"+str(score[0]))
        if (score[0][1] > self.threshold2):
            self.MergeCache[score[0][0]].Extend(unitCluster)
        else:  #new event
            cno1 = len(self.MergeCache)
            self.MergeCache[cno1] = MergeCluster(cno1)
            self.MergeCache[cno1].Extend(unitCluster)

    """""" """
    Alltweet is a pd dataframe
    id|text|username|timestamp
      |    |        |       
    
    """ """"""

    def takeSecond(self, elem):
        return elem[1]

    def theLastMerge(self):
        # k = 1
        # lenght = len(self.MergeCache)
        # deactivated = []
        # for i in range(len(self.MergeCache)):
        #     score = []
        #     for cluster in range(i+1,len(self.MergeCache)):
        #         if (cluster not in deactivated and cluster != i):
        #             scorr = self.MergeCache[cluster].similarity(self.MergeCache[i],self.a,self.b,self.c,self.d)
        #             if (scorr[1] > .8):
        #                 self.MergeCache[cluster].Extend(self.MergeCache[i])
        #                 deactivated.append(i)
        #                 break
        #             else:
        #                 score.append(scorr)
        #     if (i not in deactivated):
        #         score = sorted(score, key=self.takeSecond, reverse=True)
        #         if (score[0][1] > self.threshold3):
        #             self.MergeCache[score[0][0]].Extend(self.MergeCache[i])
        #             deactivated.append(i)
        # xcnn=0
        # for cluster in self.MergeCache:
        #     if cluster not in deactivated:
        #         for td in self.MergeCache[cluster].ids:
        #             xcnn+=1
        #             self.writer.writerow({'clusterno': cluster, 'tweetd': td})
        for cluster in self.MergeCache:
            for td in self.MergeCache[cluster].ids:
                self.writer.writerow({'clusterno': cluster, 'tweetd': td})
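# A minimal usage sketch for DynammicClustering, assuming the JVM-side tagger is
# reachable through py4j and that the helpers referenced above (MergeCluster, mset,
# cluster, abbrRemov, ...) are defined in the original module. The per-tweet loop
# that builds unit clusters and calls MergeClusters() is not part of this excerpt.
clusterer = DynammicClustering(inputFile="../Data/mytweet02.csv",
                               Outfilename="../MyOutputs/clustersIds.csv")

# POS-group a single tweet through the NER gateway
tags = clusterer.NERPass("congrats to mo yan for the nobel prize", 1234567890)

# ... run the (omitted) clustering loop, then flush clusters to clustersIds.csv
# and join them back onto the cleaned tweet text
clusterer.theLastMerge()
clusterer.tweets_to_clusters("../Data/mytweet02.csv", "../MyOutputs/clustersIds.csv")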
Example n. 51
from nltk.tree import ParentedTree
from nltk.corpus import wordnet

ptreeTEST1 = ParentedTree.convert(tree)
ptreeTEST = ParentedTree('S', [
    ParentedTree('NP', [ParentedTree('PRP', ['She'])]),
    ParentedTree('VP', [
        ParentedTree('VBD', ['was']),
        ParentedTree('RB', ['not']),
        ParentedTree('VP', [ParentedTree('VBD', ['admired'])])
    ]),
    ParentedTree('.', ['.'])
])

for i in range(len(sentList[0])):
    print(i, sentList[0][i])

from nltk import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
#wordnet_lemmatizer.lemmatize('creating','v')    # u'create'
#tokens = nltk.word_tokenize(test1)


def get_wordnet_pos(treebank_tag):
    if treebank_tag[0] == 'J':
        return wordnet.ADJ
    elif treebank_tag[0] == 'V':
        return wordnet.VERB
    elif treebank_tag[0] == 'N':
        return wordnet.NOUN
    elif treebank_tag[0] == 'R':
        return wordnet.ADV
    else:
        return ''
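# Usage sketch: POS-tag a sentence with NLTK, map each Penn tag to a WordNet POS via
# get_wordnet_pos, and lemmatize accordingly (falling back to the default noun lemma
# when no mapping exists). Assumes the punkt and averaged_perceptron_tagger data are
# downloaded.
import nltk

lemmas = []
for word, tag in nltk.pos_tag(nltk.word_tokenize("She was not admired")):
    pos = get_wordnet_pos(tag)
    lemmas.append(wordnet_lemmatizer.lemmatize(word, pos)
                  if pos else wordnet_lemmatizer.lemmatize(word))
# typically ['She', 'be', 'not', 'admire']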
Example n. 52
import json

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pymongo import MongoClient

reviews_collection = MongoClient(
    "mongodb://localhost:27017/")["Dataset_Challenge_Reviews"]["Reviews"]
business_collection = MongoClient(
    "mongodb://localhost:27017/")["Dataset_Challenge_Reviews"]["Business"]
corpus_collection = MongoClient(
    "mongodb://localhost:27017/")["Dataset_Challenge_Reviews"]["Corpus"]

stopset = set(stopwords.words('english'))
stopwords = {}
with open('stopwords.txt', 'rU') as f:
    for line in f:
        stopwords[line.strip()] = 1
lmtzr = WordNetLemmatizer()

with open(
        '../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json'
) as dataset:
    for line in dataset:
        data = json.loads(line)
        if 'Restaurants' in data["categories"] and data['city'] == 'Phoenix':
            business_collection.insert({"_id": data["business_id"]})

n = 0
with open(
        '../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json'
) as dataset:
    for line in dataset:
        data = json.loads(line)
Example n. 53
 def __init__(self, model):
     self.model = model
     self.lemmatizer = WordNetLemmatizer()
     self.intents = json.loads(open('intents.json').read())
     self.words = pickle.load(open('words.pkl', 'rb'))
     self.classes = pickle.load(open('classes.pkl', 'rb'))
Example n. 54
 def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
     self.lower = lower
     self.strip = strip
     self.stopwords = stopwords or set(sw.words('english'))
     self.punct = punct or set(string.punctuation)
     self.lemmatizer = WordNetLemmatizer()
Example n. 55
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer


class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
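# Usage sketch: plug the tokenizer into a scikit-learn vectorizer so each document is
# lemmatized during feature extraction (assumes scikit-learn plus the nltk punkt and
# wordnet data are installed).
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(tokenizer=LemmaTokenizer())
X = vect.fit_transform(["The cats are sitting on the mats"])
# the vocabulary typically contains 'cat' and 'mat' rather than 'cats'/'mats'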
Example n. 56
 def preprocess(text):
     lemmatizer = WordNetLemmatizer()
     return [
         lemmatizer.lemmatize(word.lower())
         for word in word_tokenize(str(text))
     ]
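# Usage sketch, assuming preprocess is available as a standalone helper and that
# word_tokenize and WordNetLemmatizer are imported in the original module:
tokens = preprocess("The Cats were chasing mice")
# typically ['the', 'cat', 'were', 'chasing', 'mouse']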
Example n. 57
from nltk.corpus import stopwords

stop = stopwords.words('english')

# Code from https://www.kaggle.com/pjoshi15/so-many-outfits-so-little-time-word2vec
reviews_df['Review_Tidy'] = reviews_df['Review_Tidy'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Combining title and review
reviews_df["Review_Tidy"] = reviews_df["Title"].map(
    str) + " " + reviews_df["Review_Tidy"]

# Lemmatizing
# https://pythonprogramming.net/lemmatizing-nltk-tutorial/
print("Lemmatizing....")
from nltk import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
reviews_df['Review_Tidy'] = reviews_df['Review_Tidy'].apply(
    lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))

# Remove repeated characters
print("Removing repeated characters....")
import re

reviews_df['Review_Tidy'] = reviews_df['Review_Tidy'].apply(
    lambda x: re.sub(r'(.)\1+', r'\1\1', x))

# Convert to lowercase
print("Converting words to lowercase...")
reviews_df['Review_Tidy'] = reviews_df['Review_Tidy'].str.lower()
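# A quick sanity check of the repeated-character rule used above: any run of the same
# character is collapsed to at most two occurrences.
import re

print(re.sub(r'(.)\1+', r'\1\1', "soooo goooood"))  # -> 'soo good'
print(re.sub(r'(.)\1+', r'\1\1', "good"))           # -> 'good' (double letters survive)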
Example n. 58
class Preprocessor(BaseEstimator, TransformerMixin):
    
    def __init__(self, flag, flag1, stem, stopwords=None, punct=None, lower=True, strip=True):
        self.flag = flag
        self.flag1 = flag1
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer    = stem
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = stopwords or set(sw.words('english'))
        self.punct      = set(punct) if punct else set(string.punctuation)
        
    def fit(self, X, y=None):
        #print('fit')
        return self

    def inverse_transform(self, X):
        #print('inverse_transform')
        return X

    def transform(self, X):
        #print('transform: ', len(X))
        return [
            self.tokenize(sent) for sent in X
        ]
    
    def tokenize(self, sentenses):
        '''
        sentenses = sentenses.lower()
        sentenses = sentenses.strip()
        for stop in stop_words:
            if sentenses.find(stop)!=-1: 
                self.replace_stop_words(sentenses)
                break
        '''
        res = ''
        for token, tag in pos_tag(wordpunct_tokenize(sentenses)):
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token

            # If punctuation or stopword, ignore token and continue
            if self.flag1 == 0:
                if token in self.stopwords or all(char in self.punct for char in token):
                    continue
            else:
                if all(char in self.punct for char in token):
                    continue
                if token in self.stopwords:
                    token = self.replace_stop_words(token)

            # Lemmatize or stemming the token and yield
            if self.flag == 0:
                lemma = token
            elif self.flag == 1:
                lemma = self.lemmatize(token, tag)
            elif self.flag == 2:
                lemma = self.stemmer.stem(token)
            res += lemma + ' '
        res = res.strip()
        return res
        

    def lemmatize(self, token, tag):
        """
        Converts the Penn Treebank tag to a WordNet POS tag, then uses that
        tag to perform much more accurate WordNet lemmatization.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)
    
    def stemmer(self, token):
        return self.stemmer.stem(token)
  
    def replace_stop_words(self, token):
        for stop in stop_words:
            if token.find(stop) != -1:
                if stop.find('n\'t') != -1:
                    ind2 = stop.find('n')
                    token = stop[0:ind2] + ' not'
                    break
                else:
                    token = ' be'
                    break
        return token
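# Usage sketch (the original module is assumed to import pos_tag, wordpunct_tokenize,
# sw = nltk.corpus.stopwords and wn = nltk.corpus.wordnet, and to define stop_words).
# flag=1 selects lemmatization, flag1=0 drops stopwords instead of rewriting them.
from nltk.stem import PorterStemmer

pre = Preprocessor(flag=1, flag1=0, stem=PorterStemmer())
print(pre.transform(["The cats were running quickly!"]))
# typically ['cat run quickly'] once stopwords and punctuation are stripped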
Example n. 59
            slot.append(None)

    return tokens, slot


def collide(l1, l2):
    """
    Detect whether l1 and l2 have common elements.
    :param list l1: List 1.
    :param list l2: List 2.
    :rtype: bool
    """
    return len(set(l1).intersection(l2)) > 0


wnl = WordNetLemmatizer()


def lemmatize(word):
    """
    Helper function of convert.
    :param str word: word to convert.
    :rtype: str
    """
    if word.endswith('ly'):
        word = word[:-2]
    word = wnl.lemmatize(word, 'v')
    word = wnl.lemmatize(word, 'n')
    word = wnl.lemmatize(word, 'a')
    word = wnl.lemmatize(word, 's')
    return word
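# Usage sketch: the helper chains the verb, noun, adjective and satellite-adjective
# lemmas, so inflected or derived forms collapse to a base form.
print(lemmatize('was'))      # typically 'be'
print(lemmatize('cars'))     # typically 'car'
print(lemmatize('quickly'))  # 'ly' is stripped first, then lemmatized -> 'quick'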
Example n. 60
 def lemmatizing(self, text):
     wl = WordNetLemmatizer()
     return [wl.lemmatize(word) for word in text]