Example #1
def get_cooc(chunk_trees,stoplist=True):
  triples, simple_trees = [], []
  lmtzr = WordNetLemmatizer()
  for t in chunk_trees:
    entities = []
    for chunk in t[:]:
      if isinstance(chunk,Tree) and chunk.label() == 'NP':  # .label() replaces the old .node attribute in NLTK 3
        # getting a tree for later processing of triples from the simple noun 
        # phrases (if present)
        simple_trees.append(parser_smp.parse(chunk.leaves()))
        words = []
        for word, tag in chunk[:]:
          # stem/discard elements and construct an argument
          if (stoplist and word in STOPLIST) or \
          (len([x for x in word if x.isalnum()]) == 0):
            # do not process stopwords for simple trees, do not process purely 
            # non alphanumeric characters
            continue
          if tag.startswith('N'):
            words.append(lmtzr.lemmatize(word,'n'))
          elif tag.startswith('J'):
            words.append(lmtzr.lemmatize(word,'a'))
          else:
            words.append(word)
        if len(words) > 0:
          entities.append(SEP.join(words))
    for e1, e2 in combinations(entities,2):
      triples.append((e1,util.COOC_RELNAME,e2))
      triples.append((e2,util.COOC_RELNAME,e1))
  return triples, simple_trees
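get_cooc expects pre-built chunk trees and relies on module-level names from its project (parser_smp, STOPLIST, SEP, util.COOC_RELNAME) that are not shown here. The following is a rough sketch, not taken from that project, of how NP-labelled chunk trees can be produced with NLTK's RegexpParser before calling such a function; the toy grammar is purely illustrative.

from nltk import RegexpParser, pos_tag, word_tokenize

# requires the 'punkt' and 'averaged_perceptron_tagger' NLTK data packages
chunker = RegexpParser("NP: {<DT>?<JJ>*<NN.*>+}")  # toy NP grammar: optional determiner, adjectives, nouns

sentences = ["The quick brown fox jumps over the lazy dog."]
chunk_trees = [chunker.parse(pos_tag(word_tokenize(s))) for s in sentences]
# each NP subtree carries the label 'NP' that get_cooc checks for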
Example #2
def MakeLemmaList(tagged):
    # n noun
    # v verb
    # a adjective
    # r adverb
    # m,w,.. something else

    noun_op, adj_op, adv_op, verb_op, other_op = [], [], [], [], []

    lm = WordNetLemmatizer()
    for word, tag in tagged:
        if tag.startswith("N"):
            noun_op.append(lm.lemmatize(word, "n"))
        elif tag.startswith("V"):
            lemma = lm.lemmatize(word, "v")
            # skip auxiliary / very common verbs
            if lemma not in ("be", "have", "do", "done", "should"):
                verb_op.append(lemma)
        elif tag.startswith("J"):
            adj_op.append(lm.lemmatize(word, "a"))
        elif tag.startswith("R"):
            adv_op.append(lm.lemmatize(word, "r"))
        # anything else (determiners, pronouns, ...) is ignored
    final_op = noun_op + verb_op + other_op + adj_op + adv_op
    return final_op
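Several of the examples on this page map Penn Treebank tag prefixes to WordNet POS codes by hand (N to 'n', V to 'v', J to 'a', R to 'r'). A small stand-alone helper in that spirit (a sketch, not taken from any of the projects shown):

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def penn_to_wordnet(tag):
    # map a Penn Treebank tag to the WordNet POS constant, defaulting to noun
    if tag.startswith("J"):
        return wordnet.ADJ   # 'a'
    if tag.startswith("V"):
        return wordnet.VERB  # 'v'
    if tag.startswith("R"):
        return wordnet.ADV   # 'r'
    return wordnet.NOUN      # 'n'

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running", penn_to_wordnet("VBG")))  # -> run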
Example #3
def decompose(text, keepOriginal):
    if text:
        # Case-folding
        text = text.lower();
        
        # Expand all contractions like "isn't" to "is not"
        text = expandContractions(text);
        
        # Remove punctuation
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        text = regex.sub('', text)
        
        # Remove stop words (extend this list with any other words that should be removed)
        stopWords = ['the','this','that','those','these','to','as','there','has','and','or',
                     'is','not','a','an','of','but','in','by','on','are','it','if'];
        words = text.split();
        text = ' '.join([i for i in words if i not in stopWords]);
        
        # Lemmatization
        lemmatizer = WordNetLemmatizer();
        words = text.split();
        if keepOriginal:
            text = ' '.join([i + " " + lemmatizer.lemmatize(i) for i in words]);
        else:            
            text = ' '.join([lemmatizer.lemmatize(i) for i in words]);
        
        # Remove duplicate words
        text = ' '.join(OrderedDict((word,word) for word in text.split()).keys());
    return text
def stemming():
    lmtzr = WordNetLemmatizer()
    with open('date_gone.out', 'r') as fin:
        with open('stemmed.out', 'w') as fout:
            for line in fin:
                row = line.split('\t')
                # lemmatize columns 5 and 6 separately so the lemmas of
                # column 5 are not prepended to column 6
                if len(row) > 5:
                    row[5] = ' '.join(lmtzr.lemmatize(word) for word in row[5].split(' '))
                if len(row) > 6:
                    row[6] = ' '.join(lmtzr.lemmatize(word) for word in row[6].split(' '))
                fout.write('\t'.join(row))
def firstDef(mwe,definition):
    # this is the approach of using only the first definition
    if definition=='':
        return([1,1])
    definition = definition.split('\n')[0]
    definition = definition.replace(mwe,'')
    definition = definition.replace('(','')
    definition = definition.replace(')','')
    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
    defineArr = tokenizer.tokenize(definition)
    
    lmtzr = WordNetLemmatizer()
    for i in range(len(defineArr)):
        defineArr[i] = lmtzr.lemmatize(defineArr[i])

    words = mwe.split()
    for i in range(len(words)):
        words[i] = lmtzr.lemmatize(words[i])

    if words[0] in defineArr and words[1] in defineArr:
        return([1,1])
        
    elif words[0] in defineArr:
        return([1,0])
        
    elif words[1] in defineArr:
        return([0,1])
    else:
        return([0,0])
Example #6
File: svm.py Project: Chunpai/cs200
def convert_speeches_into_matrix(features,speech_list,label):    
    sample_matrix = []
    label_vector  = []
    #print len(features)
    for speech in speech_list:
        sample = []
        speech = re.sub(r'http://[a-zA-Z0-9/.]*', ' ', speech)
        speech = re.sub(r'%[0-9.]*', ' ', speech)
        speech = re.sub(r'\$[0-9.]*', ' ', speech)  # escape $ so dollar amounts (not line ends) are matched
        for ch in " \"$!'@#%&()*+,-./:;<=>?[\\]^_`{|}~ ":
            speech = speech.replace(ch,' ')

        tokens = speech.split()
        
        #word lemmatization
        lmtzr = WordNetLemmatizer()
        tokens = [lmtzr.lemmatize(token) for token in tokens]
        tokens = [lmtzr.lemmatize(token,'v') for token in tokens]

        #tokens = bigrams(tokens)                    # uncomment this line, we can use bigram as
        unique_tokens_dict = collections.Counter(tokens)

        for fea in features:
            if fea in unique_tokens_dict:
                sample.append(unique_tokens_dict[fea])
            else:
                sample.append(0)
       
        #print(sample)
        sample_matrix.append(sample)
        label_vector.append(label)
    
    return sample_matrix,label_vector
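convert_speeches_into_matrix returns a plain count matrix and label vector. A minimal sketch of how they might be fed to an SVM, assuming scikit-learn and that the features list and per-class speech lists are prepared elsewhere in svm.py (the names below are illustrative):

from sklearn import svm

X_pos, y_pos = convert_speeches_into_matrix(features, positive_speeches, 1)
X_neg, y_neg = convert_speeches_into_matrix(features, negative_speeches, 0)

clf = svm.SVC(kernel="linear")
clf.fit(X_pos + X_neg, y_pos + y_neg)  # concatenated samples and labels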
Example #7
def parseLine(line, stopWords_, wordInd, currWrd):
    """ Removes stop words and lemmas using nltk and punctuations 
    using re. Returns a list with valid words in the line. currWrd is
    the index of next word occurring for the first time
    """
    lineWords = []
    # Hypen in hyphenated words are removed e.g. wi-fi ==> wifi.
    line = re.sub('(\w)-(\w)',r'\1\2',line)
    # replace underscore with space     
    line = re.sub('(\w)_(\w)',r'\1 \2',line)    
    # Remove punctuation marks.
    line = re.sub("[',~`@#$%^&*|<>{}[\]\\\/.:;?!\(\)_+\"-]",r'',line)
    wnLmtzr = WordNetLemmatizer()    
    for word in line.split():
        # Get index of word from wordInd. If it is seen for the first 
        # time assign an index to the word.
        word = word.lower()    # case of words is ignored
        # Lemmatize word using word net function
        word = wnLmtzr.lemmatize(word, 'n')    # with noun
        word1 = wnLmtzr.lemmatize(word, 'v')    # with verb
        if len(word1) < len(word):    # select smaller of two
            word = word1                
        # Ignore stop words and numbers.
        if word in stopWords_ or \
                re.match('^\d+x?\d*$',word) is not None:
            continue
        # Update wordInd with number of occurrences of word.
        if word not in wordInd:                
            wordInd[word] = currWrd[0]
            currWrd[0] += 1
        # Update lineWords with word.
        lineWords.append(word)
    return lineWords
def getpurpose(matched,classname):
	lmtzr = WordNetLemmatizer()
	if classname=='class4' or classname=='class6' or classname=='class3':
		exp='\w*?ing NN\w*?'
		match=re.search(exp,matched)
		purpose_text=match.group().split()
		purpose=lmtzr.lemmatize(purpose_text[0],'v')
		return purpose
	if classname=='class2':
		exp='\w*? VB\w*?'
		match=re.search(exp,matched)
		purpose_text=match.group().split()
		purpose=lmtzr.lemmatize(purpose_text[0],'v')
		return purpose
	if classname=='class5' or classname=='class7':
		exp='for IN \w*? NN\w*?';
		match=re.search(exp,matched)
		purpose_text=match.group().split()
		purpose=lmtzr.lemmatize(purpose_text[2],'v')
		return purpose
	if classname=='class1' or classname=='class9':
		exp='\w*? IN \w*? VBG'
		match=re.search(exp,matched)
		if match:
			purpose_text=match.group().split()
			purpose=lmtzr.lemmatize(purpose_text[2],'v')
			return purpose
	if classname=='class1':
		exp='\w*? TO \w*? VB\w*? \w*? NN\w*?'
		match=re.search(exp,matched)
		if match:
			purpose_text=match.group().split()
			purpose=lmtzr.lemmatize(purpose_text[2],'v')
			return purpose
	return None
Example #9
def data_preprocessing(file_path):
    f = open(file_path,'r')
    speech_list = f.read().split("###")   # read speeches, split with ###, and save them into list.
    del speech_list[-1]
    f.close()
    #print len(speech_list)
    f = open(file_path,'r')
    speeches = f.read().lower()    #set all letters lower case
    speeches = re.sub(r'http://[a-zA-Z0-9/.]*', ' ', speeches)
    speeches = re.sub(r'%[0-9.]*', ' ', speeches)
    speeches = re.sub(r'\$[0-9.]*', ' ', speeches)  # escape $ so dollar amounts (not line ends) are matched
    #speeches = re.sub('\\\\xe2\\\\x80\\\\x[a-zA-Z0-9]*',' ',speeches)
    #print speeches
    for ch in " \"$!'@#%&()*+,-./:;<=>?[\\]^_`{|}~ ":
        speeches = speeches.replace(ch,' ')

    tokens = speeches.split()
    
    #word lemmatization
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(token) for token in tokens]
    tokens = [lmtzr.lemmatize(token,'v') for token in tokens]

    #tokens = bigrams(tokens)                    # uncomment this line, we can use bigram as

    total_tokens_count = len(tokens)
    unique_tokens_dict = collections.Counter(tokens)   #key is word, value is the count,
                                                       #also default value 0 for non-exsit key.

    result = [ speech_list, unique_tokens_dict, total_tokens_count ]
    return result
def stemWordMatch(question,sentence):

    lmtzr = WordNetLemmatizer()

    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens=set(nltk.word_tokenize(sentence))

    count=0
    '''for i in sentence_tokens:
        #Finding the exact word match
        if lmtzr.lemmatize(i, 'v').lower() in  [lmtzr.lemmatize(x, 'v').lower() for x in question_tokens]:
            #print  'matching word is:',i
            count=count+6
        elif i.lower() in [x.lower() for x in question_tokens]:
            print 'i is :',i
            count=count+3
    #print 'Exact word match count is :',count'''

    for i in sentence_tokens:
        #Finding the exact word match

        if i.lower() in [x.lower() for x in question_tokens]:
            #print 'i is :',i
            count=count+3
        elif lmtzr.lemmatize(i, 'v').lower() in  [lmtzr.lemmatize(x, 'v').lower() for x in question_tokens]:
            #print  'matching word is:',i
            count=count+6

    #print 'Exact word match count is :',count


    return count
Example #11
def getlemmas(tokens):
    lemmas = []
    for token in tokens:
        if len(token) < 2 or not isWord(token) or token == "the":
            lemmas.append({})
            continue
        
        tokenLemmas = {}
        #Synonyms
        for syn in wn.synsets(token):
            #Derived Forms and their Syns
            for lemma in syn.lemmas():
                for df in lemma.derivationally_related_forms():
                    for ln in df.synset().lemma_names():
                        tokenLemmas[ln] = 4
                    tokenLemmas[df.name()] = 3
            for lname in syn.lemma_names():
                tokenLemmas[lname] = 2
        
        #Wordnet lemmas
        l = WordNetLemmatizer()
        for x in ('v','a','s','r','n'):
            tmp = l.lemmatize(token, x)
            tokenLemmas[tmp] = 1
            tmp = l.lemmatize(tmp, x)
            tokenLemmas[tmp] = 1
        
        #Exact
        tokenLemmas[token] = 1
        
        lemmas.append(tokenLemmas)
    
    return lemmas
Example #12
def lemma_tokenize(paragraph):
    lmtzr = WordNetLemmatizer()
    try:
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
    except LookupError:
        nltk.download('wordnet')
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
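lemma_tokenize depends on a project-level tokenize() helper that is not shown. A minimal stand-in, assuming it is meant to yield one list of word tokens per sentence:

import nltk

def tokenize(paragraph):
    # hypothetical replacement for the project's tokenize() helper
    return [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(paragraph)]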
def extract_cooking_methods(input_steps, title):
    steps = copy.deepcopy(input_steps)
    steps.append(title)
    tk_steps = [pos_tag(word_tokenize(w.lower())) for w in steps]

    methods = []
    for step in tk_steps:
        # methods += [wordnet_lemmatizer.lemmatize(w, pos='v').encode('ascii', 'ignore') for (w, pos) in step if 'VB' in pos]
        methods += [w.encode('ascii', 'ignore') for (w, pos) in step if 'VB' in pos]

    for step in steps:
        if 'preheat' in step:
            methods += ['preheat', 'preheating']
        if 'microwav' in step:
            methods += ['microwave', 'microwaving']
        if 'place' in step:
            methods.append('place')
        if 'form' in step:
            methods.append('form')
        if 'sprinkle' in step:
            methods.append('sprinkle')

    wordnet_lemmatizer = WordNetLemmatizer()
    discard = ['be', 'use', 'need', 'should', 'allow', 'pink', 'turn', 'reserve']
    methods =  [m for m in methods if wordnet_lemmatizer.lemmatize(m, pos='v') not in discard and len(m) > 2]
    stems = [wordnet_lemmatizer.lemmatize(w, pos='v') for w in methods]
    gerunds = [w[:-1] + 'ing' for w in stems if w[-1] == 'e']
    gerunds +=  [w + 'ing' for w in stems if w[-1] != 'e']
    methods = list(set(methods + stems + gerunds))
    return methods
class LexicalBigramUnigramAnalyzer(object):   
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()    
        self.tb = Blobber(pos_tagger=PerceptronTagger())
        self.sentencer = SentenceTokenizer()
    def __call__(self, doc):   
        tokens = []     
        for sent in self.sentencer.tokenize(doc.decode('ascii','ignore')):
            tagged = self.tb(sent.lower()).tags    
            
            tagged = [(t[0], penn_to_wn(t[1])) for t in tagged]
            tagged = [(t[0], t[1]) for t in tagged if t[0] not in stopwords.words('english')]
            ng = zip(tagged, tagged[1:])
            rule1 = [(t[0],t[1]) for t in ng if t[0][1]== wn.ADJ and t[1][1]== wn.NOUN]
            rule2 = [(t[0],t[1]) for t in ng if (t[0][1]== wn.ADV and t[1][1]== wn.VERB) or (t[0][1]== wn.VERB and t[1][1]== wn.ADV)]
            rule3 = [(t[0],t[1]) for t in ng if t[0][1]== wn.VERB and t[1][1]== wn.VERB]
            rule4 = [(t[0],t[1]) for t in ng if t[0][1]== wn.NOUN and t[1][1]== wn.NOUN]
            
            filtered_list = rule1 + rule2 + rule3 + rule4
                             
                    
            # Lemmatize
            filtered_bigrams = [self.lemmatizer.lemmatize(t[0][0], t[0][1]) + ' ' + self.lemmatizer.lemmatize(t[1][0], t[1][1]) for t in filtered_list]
            filtered_unigrams = [self.lemmatizer.lemmatize(w[0], w[1]) for w in tagged]
            for bigram in filtered_bigrams:
                tokens.append(bigram)
            for unigram in filtered_unigrams:
                tokens.append(unigram)
        return tokens
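A callable class like LexicalBigramUnigramAnalyzer is typically plugged into a scikit-learn vectorizer as a custom analyzer. A sketch of that wiring, assuming the penn_to_wn helper and the textblob/NLTK dependencies used above are importable and that documents is an iterable of raw documents in the form the analyzer expects:

from sklearn.feature_extraction.text import TfidfVectorizer

# the vectorizer calls the analyzer once per document and indexes the
# unigram and bigram strings it returns
vectorizer = TfidfVectorizer(analyzer=LexicalBigramUnigramAnalyzer())
X = vectorizer.fit_transform(documents)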
Example #15
def single_master_list(data):
    my_vocab = deepcopy(init_to_zero_vocab)
    data = data.lower()
    data = re.sub("\S+@\S", " EMAILREPLACED ", data)
    data = re.sub("\d+", " NUMBERREPLACED ", data)
    data = re.sub("\s?http:s?\/\/\w{0,3}\.\w+\.\w{0,3}\S?|w{0,3}\.\w+\.\w{0,3}\S?", " URLREPLACED ", data)
    for punct in string.punctuation:
        data = data.replace(punct," ")
    format_data = data.split()
    no_stop_words = []
    l = WordNetLemmatizer()
    for word in format_data:
        if (stop):
            if word not in stopwords.words('english'):
                if (lem):
                    no_stop_words.append(l.lemmatize(word))
                else:
                    no_stop_words.append(word)
        else:
            if (lem):
                no_stop_words.append(l.lemmatize(word))
            else:
                no_stop_words.append(word)
            
    for element in no_stop_words:
        if(element in my_vocab):
            my_vocab[element] += 1

    return my_vocab
Example #16
def lemmatize(w,p):
    # assumes a module-level WordNetLemmatizer instance named `wnl`;
    # lemmatize() is an instance method, so the instance is not passed again as an argument
    if p.startswith("N"):
        return (wnl.lemmatize(w,'n'),p)
    elif p.startswith("V"):
        return (wnl.lemmatize(w,'v'),p)
    else:
        return (w,p)
Example #17
def get_dante_answers(senseval_data):
    # TODO: implement probability based inference of accuracy, i.e. POS adds prob, colloc adds prob, phrase adds prob
    #  - must find values for probs first. for colloc - adjacency affects it. for phrase - order affects it
    # Or, just test adjacency, presence of colloc and phrase words in the sentence (test both lemmatized and not)
    # Methods: Set arbitrary values and adjust manually
    #          Use a learning algorithm to find the best mix of values
    DanteAPI.initialize()
    dante = DanteAPI.get_all_word_meanings()
    print "\nDANTE parsing completed"
    dante_answers = {}
    lemmatizer = WordNetLemmatizer()
    for sentence_data in senseval_data:
        for phrase in sentence_data["test_phrases"]:
            word_id, raw_word = phrase["headword"]
            word = lemmatizer.lemmatize(raw_word)
            phrase_meaning = _answer_phrase(word, sentence_data, dante)
            if phrase_meaning is not None:
                dante_answers[word_id] = phrase_meaning
            else:
                dante_answers[word_id] = _answer_word(word, sentence_data, dante)

        for word_id, raw_word in sentence_data["test_words"].iteritems():
            word = lemmatizer.lemmatize(raw_word)
            dante_answers[word_id] = _answer_word(word, sentence_data, dante)
    return dante_answers
Example #18
def getting_sentiment(word,pos):
    flag = 0
    if 'NN' in pos:
        tag = 'n'
    elif 'JJ' in pos:
        tag = 'a'
        if pos == 'JJS':
            flag = 1
    elif 'VB' in pos:
        tag = 'v'
    elif 'RB' in pos:
        tag = 'r'
    else:
        tag = ''
    stemmer = WordNetLemmatizer()
    if tag != '':
        x = stemmer.lemmatize(word,tag)
    else:
        x = stemmer.lemmatize(word)

    try:
        score = float(score_dic[x]) #* float(m1)
    except KeyError:
        synsets = list(swn.senti_synsets(x, tag))  # senti_synsets returns an iterator in newer NLTK versions
        if len(synsets) > 0:
            score = synsets[0].pos_score() * 5
        else:
            score = 100

    if flag == 1 and score != -100 and score < 4:
        score = score + 1
    elif flag == 1 and score != -100 and score > -4 and score < 0:
        score = score - 1
    print word + '--->' + str(score)
    return score
Example #19
def get_singular_forms_NN(plural_forms, nn):
    lemmatizer = WordNetLemmatizer()
    singular_forms = []
    for w in set(plural_forms):
        if lemmatizer.lemmatize(w) in nn:
            singular_forms.append(lemmatizer.lemmatize(w))
        
    return singular_forms
Example #20
_wnl = None  # module-level cache so the lemmatizer is only constructed once

def lemmatize(text,pos=None):
    from nltk.stem.wordnet import WordNetLemmatizer
    global _wnl
    if not _wnl:
        _wnl = WordNetLemmatizer()
    if pos:
        return _wnl.lemmatize(text,pos)
    return _wnl.lemmatize(text)
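For reference, typical behavior of the wrapper above (WordNet lemmatization defaults to treating the input as a noun when no POS is given):

print(lemmatize("geese"))         # -> goose
print(lemmatize("running"))       # -> running  (treated as a noun)
print(lemmatize("running", "v"))  # -> run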
Example #21
 def searchString(self, sentence, search_word):
     # search sentence for given word, lemmatize everything
     lemm = WordNetLemmatizer()
     lem_search = lemm.lemmatize(search_word)
     for idx,word in enumerate(sentence.split(' ')):
         if lemm.lemmatize(word).lower() == lem_search.lower():
             return idx
     # -1 is returned only after every word in the sentence has been checked
     return -1
Example #22
	def l(tags_list):
		tags_list = ast.literal_eval(tags_list)
		lmtzr = WordNetLemmatizer()
		return_tags_list = []
		for t in list(tags_list):
			if get_wordnet_pos(t[1]):
				return_tags_list.append(lmtzr.lemmatize(t[0],get_wordnet_pos(t[1]))) 
			else:
				return_tags_list.append(lmtzr.lemmatize(t[0])) 
		return return_tags_list
Example #23
def lmtz(word):

    wl = WordNetLemmatizer()
    words = word.split("/")
    word = words[0]
    tag = words[1]
    if tag.startswith("V"):
        return wl.lemmatize(word, "v") + "/" + tag
    else:
        return wl.lemmatize(word) + "/" + tag
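Expected behavior of lmtz on word/TAG tokens:

print(lmtz("running/VBG"))  # -> run/VBG   (verb tags are lemmatized as verbs)
print(lmtz("cats/NNS"))     # -> cat/NNS   (everything else uses the noun default)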
def stemmer_word(text):
    text1 = nltk.word_tokenize(text)
    lmtzr = WordNetLemmatizer()
    tagged = nltk.pos_tag(text1)
    for element in tagged:
        if get_wordnet_pos(element[1])!= 0:
            text = text.replace(element[0],lmtzr.lemmatize(element[0],get_wordnet_pos(element[1])))
        else:
            text = text.replace(element[0],lmtzr.lemmatize(element[0]))
    return text
Example #25
def extract_phrases(text):
	lmtzr = WordNetLemmatizer()
	token_buffer = []
	tokens = word_tokenize(text.lower())
	annotation = pos_tag(tokens)
	for (token, pos) in annotation:
		if pos in valid_POS:
			if len(lmtzr.lemmatize(token)) > 1:
				token_buffer.append(lmtzr.lemmatize(token))
	return token_buffer
Example #26
def process_lemm(newLine):
	newLine = newLine.split()
	lmtzr = WordNetLemmatizer()
	for x in range(0,len(newLine)):
		newLine[x] = lmtzr.lemmatize(newLine[x])
		newLine[x] = lmtzr.lemmatize(newLine[x], "v")
	
	
	newLine = " ".join(newLine)
	#print newLine
	return newLine
Example #27
def lemmatize(article):
    '''
    INPUT: string
    OUTPUT: lemmatized string

    Lemmatizes all of the words in an article.
    '''
    lem = WordNetLemmatizer()
    article_lem = ' '.join([lem.lemmatize(lem.lemmatize(word, pos ='v')) for word in article.split()])
    article_lem = ' '.join([lem.lemmatize(lem.lemmatize(word)) for word in article_lem.split()])
    return article_lem
Example #28
def get_singular_sentence(sentence):
    lmtzr = WordNetLemmatizer()
    keywords = extract_keywords(sentence.lower())
    singular_words = []
    for keyword in keywords:
        if (lmtzr.lemmatize(keyword[0])):
            singular_words.append(lmtzr.lemmatize(keyword[0]))
        else:
            singular_words.append(keyword[0])

    return ' '.join(singular_words)
Example #29
def initializeData(data): 
#    graphics_train = fetch_20newsgroups(subset = dataSet,\
#    categories = categories, shuffle = True, random_state = 42)
     
    wnl = WordNetLemmatizer()
    stop_words = text.ENGLISH_STOP_WORDS
     
    #List of dicts, each element represents word to number mapping for each document
    termDictList = []
    #Dictionary for each term which stores the number of documents that contains this term
    termDocCountDict = {}
    # set of term 
    termSet = set()
    # list of int, each element represents total number of terms in each tokenized document
    termCountList = []    
     
    # get document frequency for each term
    for i in range(len(data)):
        document = data[i].lower()
        words = set(word_tokenize(document))
        for word in words:
            if word.isalpha():
                term = wnl.lemmatize(word)
                if term not in stop_words:
                    if term not in termDocCountDict:
                        termDocCountDict[term] = 0
                    termDocCountDict[term] += 1
     
    # get termDict and termSet
    for i in range(len(data)):
        termDict = {}
        termCount = 0
        document = data[i].lower()
        words = word_tokenize(document)
        for word in words:
            if word.isalpha():
                term = wnl.lemmatize(word)
                if term not in stop_words:
                    if term in termDocCountDict:
                        if termDocCountDict[term] >= 110 and termDocCountDict[term] <= 11000:
                            termSet.add(term)
                            termCount += 1
                            # fill in termDict
                            if term not in termDict:
                                termDict[term] = 0
                            termDict[term] += 1
                        else:
                            del termDocCountDict[term]
        termDictList.append(termDict)
        termCountList.append(termCount)
         
    return (termDictList, termCountList, termDocCountDict, termSet)
Example #30
def get_tokens(words):
    """returns list of tokens"""
    wnl = WordNetLemmatizer()
    for i in range(0, len(words)):
        words[i] = words[i].lower()
        words[i] = re.sub(ur"\W", "", words[i], flags=re.U)
        words[i] = wnl.lemmatize(words[i])
    stpwrd = stopwords.words('english')
    stpwrd.extend(['m','re','o','d','vs','w','3','2','rt','u','ll','ve'])
    tokens = [i for i in words if i not in stpwrd]
    #print tokens
    return tokens
Example #31
stopwords.extend(newstoplist)
new_list = []

#Tokenization and removal of stopwords
for sent1 in process_list:
    newsent = " ".join(sent1)
    word_tokens = word_tokenize(newsent)
    filtered_sentence = [w for w in word_tokens if w not in stopwords]
    #print(filtered_sentence)
    new_list.append(filtered_sentence)

#introducing lemmatization
lemma = WordNetLemmatizer()
new_list2 = []
for sent1 in new_list:
    normalized = " ".join(lemma.lemmatize(word, 'n') for word in sent1)
    x = normalized.split()
    y = [s for s in x if len(s) > 2]
    new_list2.append(y)

#Using Bigrams
texts = new_list2
phrases = Phrases(new_list2)
bigram = Phraser(phrases)
texts = [bigram[line] for line in new_list2]

# NMF is able to use tf-idf, so using TFIDF
no_features = 750
tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                   min_df=2,
                                   max_features=no_features,
Example #32
def lemmatize_word(word):
    if len(word) < 4:
        return word
    lem = WordNetLemmatizer()
    return lem.lemmatize(lem.lemmatize(word, "n"), "v")
# Query System
while True:
    query = input("Enter Query\n")
    if query == "exit":
        break

    # Query Preprocessing
    query = query.lower()
    query = query.split(' ')

    # Lemmatization
    temp = []
    for word in name:
        if word not in stop_words:
            temp.append(lem.lemmatize(word))
    name = temp

    print(query)

    # Creating an empty graph for output
    T = nx.empty_graph(0, create_using=nx.MultiDiGraph())

    for q_term in query:
        if q_term in inverted_index:

            # Fetch the nodes and edges corresponding to each term
            candidates = inverted_index[q_term]
            print('Candidates:', candidates)
            for term in candidates:
                #  Load the edges with term name, if they exist
termSet = set()
termSet = getTerms(termSet, termDict)

with open(remappedFile) as readFile:
    idSet = {line.split('\t')[0] for line in readFile}

with open(descFile) as readFile:
    termDict = {
        eval(line.split('\t')[0]): eval(line.split('\t')[1])
        for line in readFile if eval(line.split('\t')[0]) in idSet
    }

termSet = getTerms(termSet, termDict)

remove_words = ['cell', 'neoplasm', 'neoplasms', 'multiple']

for term in remove_words:
    termSet.discard(term)

if nlpType == 'stem':
    snow = SnowballStemmer('english')
    termSet = {snow.stem(term) for term in termSet}
elif nlpType == 'lemmatize':
    lemmatizer = WordNetLemmatizer()
    termSet = {lemmatizer.lemmatize(term) for term in termSet}

with open(keepWordFile, 'w') as writeFile:
    [writeFile.write(term + '\n') for term in termSet]
Example #35
class JournalTitleAbbreviationProvider(StashableBase):
    """Manage resources required to support journal title abbreviation assignment
    using ISO LTWA abbreviations at:

      https://www.issn.org/services/online-services/access-to-the-ltwa/

    Portions of this module have been adapted from the approach developed
    in https://github.com/adlpr/iso4.git with the following license:

        MIT License

        Copyright (c) 2018 Alex DelPriore

        Permission is hereby granted, free of charge, to any person obtaining a copy
        of this software and associated documentation files (the "Software"), to deal
        in the Software without restriction, including without limitation the rights
        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
        copies of the Software, and to permit persons to whom the Software is
        furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be included in all
        copies or substantial portions of the Software.
    """
    def __init__(self, **kwargs):
        dirName = "journal-abbreviations"
        cachePath = kwargs.get("cachePath", ".")
        super(JournalTitleAbbreviationProvider,
              self).__init__(cachePath, [dirName])
        urlTargetIsoLtwa = kwargs.get(
            "urlTargetLtwa",
            "https://www.issn.org/wp-content/uploads/2013/09/LTWA_20160915.txt"
        )
        dirPath = os.path.join(cachePath, dirName)
        useCache = kwargs.get("useCache", True)
        #
        self.__noAbbrevPlaceHolder = "n.a."
        self.__prefixKey = "prefix"
        self.__suffixKey = "suffix"
        self.__infixKey = "infix"
        self.__fullWordKey = "full"
        self.__lowercaseFlag = "lower"
        self.__uppercaseFlag = "upper"
        self.__titlecaseFlag = "title"
        #
        self.__wml = WordNetLemmatizer()
        #
        self.__stopWords = set([
            "a",
            "about",
            "afore",
            "after",
            "ago",
            "along",
            "amid",
            "among",
            "amongst",
            "an",
            "and",
            "apropos",
            "as",
            "at",
            "atop",
            "but",
            "by",
            "ca",
            "circa",
            "for",
            "from",
            "hence",
            "in",
            "into",
            "like",
            "nor",
            "of",
            "off",
            "on",
            "onto",
            "ontop",
            "or",
            "out",
            "over",
            "per",
            "since",
            "so",
            "than",
            "the",
            "though",
            "til",
            "till",
            "to",
            "unlike",
            "until",
            "unto",
            "up",
            "upon",
            "upside",
            "versus",
            "via",
            "vis-a-vis",
            "vs",
            "when",
            "whenever",
            "where",
            "whereas",
            "wherever",
            "while",
            "with",
            "within",
            "yet",
            "aus",
            "des",
            "der",
            "für",
            "im",
            "und",
            "zu",
            "zur",
            "da",
            "de",
            "del",
            "della",
            "delle",
            "di",
            "do",
            "e",
            "el",
            "en",
            "et",
            "i",
            "la",
            "le",
            "lo",
            "las",
            "les",
            "los",
            "y",
            "van",
            "voor",
            "og",
        ])
        self.__abbrevD, self.__conflictD, self.__multiWordTermList = self.__rebuildCache(
            urlTargetIsoLtwa, dirPath, useCache)
        # Tokenize a string at space boundaries, respecting a special list of multi-word strings -
        self.__tokenizerRegex = re.compile("({}|\\s+)".format("|".join([
            "(?:^|\\s){}(?:\\s|$)".format(w) for w in self.__multiWordTermList
        ])),
                                           flags=re.I)

    def testCache(self):
        # Lengths ...
        try:
            logger.info("Abbreviation length LTWA %d",
                        len(self.__abbrevD["full"]))
            if len(self.__abbrevD) == 4 and len(
                    self.__abbrevD["full"]) > 39000 and len(
                        self.__multiWordTermList) > 250:
                return True
        except Exception:
            pass
        return False

    def __rebuildCache(self, urlTargetIsoLtwa, dirPath, useCache):
        """Rebuild the cache of ISO abbreviation term data

        Args:
            urlTargetIsoLtwa (str): URL for ISO4 LTWA title word abbreviations
            dirPath (str):  cache path
            useCache (bool):  flag to use cached files

        Returns:
            tuple: (dict) title word abbreviations
                   (dict) language conflict dictionary
                   (list) multi-word abbreviation targets

        Notes:
            ISO source file (tab delimited UTF-16LE) is maintained at the ISSN site -
            https://www.issn.org/wp-content/uploads/2013/09/LTWA_20160915.txt
        """
        aD = {}
        mU = MarshalUtil(workPath=dirPath)
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        isoLtwaNamePath = os.path.join(dirPath, "iso-ltwa.%s" % ext)
        logger.debug("Using cache data path %s", dirPath)
        mU.mkdir(dirPath)
        if not useCache:
            for fp in [isoLtwaNamePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and mU.exists(isoLtwaNamePath):
            aD = mU.doImport(isoLtwaNamePath, fmt=fmt)
            logger.debug("Abbreviation name length %d", len(aD["abbrev"]))
        elif not useCache:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetIsoLtwa,
                        dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetIsoLtwa))
            ok = fU.get(urlTargetIsoLtwa, fp)
            aD = self.__getLtwaTerms(dirPath, fp)
            ok = mU.doExport(isoLtwaNamePath, aD, fmt=fmt)
            logger.debug("abbrevD keys %r", list(aD.keys()))
            logger.debug("Caching %d ISO LTWA in %s status %r",
                         len(aD["abbrev"]), isoLtwaNamePath, ok)
        #
        abbrevD = aD["abbrev"] if "abbrev" in aD else {}
        conflictD = aD["conflicts"] if "conflicts" in aD else {}
        multiWordTermL = aD[
            "multi_word_abbrev"] if "multi_word_abbrev" in aD else []
        #
        return abbrevD, conflictD, multiWordTermL

    def getJournalAbbreviation(self, title, usePunctuation=True):
        #
        useLangs = ["eng"]
        title = unicodedata.normalize("NFKD", title)
        useLangs = set(useLangs)

        # split title either at space on as defined as multi-word targets
        titleWords = list(
            filter(lambda w: w.strip(), self.__tokenizerRegex.split(title)))

        retWordList = []

        # Exception for single-word titles
        if len(titleWords) == 1 and len(titleWords[0].split(" ")) == 1:
            return title

        for origWord in titleWords:
            # normalize and lemmatize
            wordNorm = self.__normalizeWord(origWord)

            # skip stopwords
            if wordNorm in self.__stopWords:
                continue

            # if normalized word fails, try lemma
            wordLemma = self.__wml.lemmatize(wordNorm)
            wordCandidates = (
                wordNorm, wordLemma) if wordNorm != wordLemma else (wordNorm, )

            wordAbbr = ""
            capitalization = self.__getCapitalization(origWord)

            for word in wordCandidates:
                # Check for language degeneracy in mapping
                if self.__fullWordKey in self.__conflictD and word in self.__conflictD[
                        self.__fullWordKey]:
                    allowedLangs = self.__conflictD[
                        self.__fullWordKey][word].keys()
                    possibleLangs = allowedLangs & useLangs
                    if len(possibleLangs) == 1:
                        wordAbbr = self.__conflictD[self.__fullWordKey][word][
                            possibleLangs.pop()]
                        break
                    else:
                        logger.error(
                            "Language mapping conflict for term %r (%r)", word,
                            allowedLangs)
                        return title
                if not wordAbbr and self.__prefixKey in self.__conflictD:
                    # prefix conflicts
                    for prefix in sorted(
                            self.__conflictD[self.__prefixKey].keys()):
                        if word.startswith(prefix):
                            allowedLangs = self.__conflictD[
                                self.__prefixKey][word].keys()
                            possibleLangs = allowedLangs & useLangs
                            if len(possibleLangs) == 1:
                                wordAbbr = self.__conflictD[self.__prefixKey][
                                    word][possibleLangs.pop()]
                            else:
                                logger.error(
                                    "Language mapping conflict for term %r (%r)",
                                    word, allowedLangs)
                                return title

                if not wordAbbr and self.__suffixKey in self.__conflictD:
                    # suffix conflicts
                    for suffix in sorted(
                            self.__conflictD[self.__suffixKey].keys()):
                        if word.endswith(suffix):
                            allowedLangs = self.__conflictD[
                                self.__suffixKey][word].keys()
                            possibleLangs = allowedLangs & useLangs
                            if len(possibleLangs) == 1:
                                wordAbbr = self.__conflictD[self.__suffixKey][
                                    word][possibleLangs.pop()]
                            else:
                                logger.error(
                                    "Language mapping conflict for term %r (%r)",
                                    word, allowedLangs)
                                return title

                if not wordAbbr and self.__infixKey in self.__conflictD:
                    # infix conflicts
                    for infix in sorted(
                            self.__conflictD[self.__infixKey].keys()):
                        if infix in word:
                            allowedLangs = self.__conflictD[
                                self.__infixKey][word].keys()
                            possibleLangs = allowedLangs & useLangs
                            if len(possibleLangs) == 1:
                                wordAbbr = self.__conflictD[
                                    self.__infixKey][word][possibleLangs.pop()]
                            else:
                                logger.error(
                                    "Language mapping conflict for term %r (%r)",
                                    word, allowedLangs)
                                return title
                if wordAbbr:
                    break

                # Evaluate abbreviation mapping for each word type
                if not wordAbbr and self.__fullWordKey in self.__abbrevD and word in self.__abbrevD[
                        self.__fullWordKey]:
                    wordAbbr = self.__abbrevD[self.__fullWordKey][word]
                    break
                if not wordAbbr and self.__prefixKey in self.__abbrevD:
                    # check prefixes in descending length order
                    for prefix in sorted(
                            self.__abbrevD[self.__prefixKey].keys(),
                            key=lambda p: (-len(p), p)):
                        if word.startswith(prefix):
                            wordAbbr = self.__abbrevD[self.__prefixKey][prefix]
                            break
                if not wordAbbr and self.__suffixKey in self.__abbrevD:
                    # check suffixes in descending length order
                    for suffix in sorted(
                            self.__abbrevD[self.__suffixKey].keys(),
                            key=lambda p: (-len(p), p)):
                        if word.endswith(suffix):
                            wordAbbr = self.__abbrevD[self.__suffixKey][suffix]
                            break
                if not wordAbbr and self.__infixKey in self.__abbrevD:
                    # check infixes in descending length order
                    for infix in sorted(self.__abbrevD[self.__infixKey].keys(),
                                        key=lambda p: (-len(p), p)):
                        if infix in word:
                            wordAbbr = self.__abbrevD[self.__infixKey][infix]
                            break
                if wordAbbr:
                    break

            # Apply formatting preferences
            if wordAbbr in ("", self.__noAbbrevPlaceHolder):
                wordAbbr = self.__finalizeOutput(word,
                                                 capitalization,
                                                 usePunctuation=False)
            else:
                wordAbbr = self.__finalizeOutput(wordAbbr, capitalization,
                                                 usePunctuation)

            retWordList.append(wordAbbr)
        return unicodedata.normalize("NFKC", " ".join(retWordList))

    def __getType(self, word):
        """Classify the input word base on internal punctuation."""
        if word.startswith("-"):
            return self.__infixKey if word.endswith("-") else self.__suffixKey
        elif word.endswith("-"):
            return self.__prefixKey
        else:
            return self.__fullWordKey

    def __getCapitalization(self, word):
        """Classify case construction of the input term.

        Args:
            word (str): Input term to be evaluated

        Returns:
            (str): flag indicating case ('upper', 'lower', 'title')
        """
        if word == word.upper():
            return self.__uppercaseFlag
        elif word[0].isupper():
            # guess title case if not all upper
            return self.__titlecaseFlag
        else:
            return self.__lowercaseFlag

    def __normalizeWord(self, word):
        """Strip hyphens, other punctuation, lower, normalize NFKD."""
        parts = []
        for part in word.split(" "):
            part = re.sub(r"(^\-|\p{P}+$)", "", part).strip()
            parts.append(unicodedata.normalize("NFKD", part.lower()))
        return " ".join(parts).strip()

    def __normalizeAbbr(self, abbr):
        """Strip hyphens, period, lower, normalize NFKD (if not "n.a.")."""
        if abbr == self.__noAbbrevPlaceHolder:
            return abbr
        parts = []
        for part in abbr.split(" "):
            parts.append(
                unicodedata.normalize("NFKD",
                                      part.strip("- ").rstrip(".").lower()))
        return " ".join(parts)

    def __finalizeOutput(self, word, capitalization, usePunctuation):
        """Modify output word according to capitalization and punctuation preferences."""
        parts = []
        for part in word.split(" "):
            if capitalization == self.__uppercaseFlag:
                part = part.upper()
            elif capitalization == self.__titlecaseFlag:
                part = string.capwords(part)
            if usePunctuation:
                part += "."
            parts.append(part)
        return " ".join(parts)

    def __getLtwaTerms(self, dirPath, isoLtwaNamePath):
        logger.info("Processing terms in %r", isoLtwaNamePath)
        titleWordAbbrevD = {}
        conflictD = {}
        multiWordTermL = []
        abbrevD = {
            "abbrev": titleWordAbbrevD,
            "conflicts": conflictD,
            "multi_word_abbrev": multiWordTermL
        }
        #
        mU = MarshalUtil(workPath=dirPath)
        try:
            tsv = mU.doImport(isoLtwaNamePath,
                              fmt="tdd",
                              rowFormat="list",
                              encoding="utf-16-le")
            logger.debug("Read isoLtwaNamePath %s record count %d",
                         isoLtwaNamePath, len(tsv))
            conflictWords = set()
            for line in tsv:
                try:
                    if len(line) == 3:
                        word, abbr, langs = line
                    else:
                        word, abbr = line
                        langs = ""

                except Exception:
                    logger.error("Format issue for line %r", line)
                    continue
                wType = self.__getType(word)
                word = self.__normalizeWord(word)
                abbr = self.__normalizeAbbr(abbr)
                # Assign word type -
                if wType not in titleWordAbbrevD:
                    titleWordAbbrevD[wType] = {}
                # Detect conflict words
                if word in titleWordAbbrevD[wType]:
                    conflictWords.add((wType, word))
                elif " " in word:
                    multiWordTermL.append(re.escape(word))
                #
                titleWordAbbrevD[wType][word] = abbr
            # Build dictionary capturing degenerate language specific mappings
            for wType, word in conflictWords:
                # remove from main list
                titleWordAbbrevD[wType].pop(word)
            logger.debug("conflict words length %d", len(conflictWords))
            for line in tsv:
                try:
                    if len(line) == 3:
                        word, abbr, langs = line
                    else:
                        word, abbr = line
                        langs = ""
                except Exception:
                    logger.error("Format issue for line %r", line)
                    continue
                wType = self.__getType(word)
                word = self.__normalizeWord(word)
                logger.debug("Word %r wordType %r", word, wType)
                if (wType, word) in conflictWords:
                    abbr = self.__normalizeAbbr(abbr)
                    if wType not in conflictD:
                        conflictD[wType] = {}
                    if word not in conflictD[wType]:
                        conflictD[wType][word] = {}
                    for lang in langs.split(","):
                        conflictD[wType][word][lang.strip()] = abbr
            multiWordTermL = sorted(list(set(multiWordTermL)))
            #
            abbrevD = {
                "abbrev": titleWordAbbrevD,
                "conflicts": conflictD,
                "multi_word_abbrev": multiWordTermL
            }
            for ky in abbrevD["abbrev"]:
                logger.debug("abbreviation type %r length %r", ky,
                             len(abbrevD["abbrev"][ky]))
            for ky in abbrevD:
                logger.debug("Content type %r length %r", ky, len(abbrevD[ky]))
            #
        except Exception as e:
            logger.exception("Failing reading %s with %s", isoLtwaNamePath,
                             str(e))

        return abbrevD
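A brief usage sketch for the provider above, assuming the rcsb.utils helpers it imports, the NLTK WordNet data, and network access for the first cache build are all available; the cache directory name is illustrative:

provider = JournalTitleAbbreviationProvider(cachePath="./CACHE", useCache=False)  # first run fetches and caches the LTWA data
if provider.testCache():
    print(provider.getJournalAbbreviation("Journal of Biological Chemistry"))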
class SeoKeywords:
    def __init__(self):
        self.connect_db()

        self.stemmer = LancasterStemmer()
        self.lemmatizer = WordNetLemmatizer()

        self.noun_dict = {}
        self.adj_dict = {}

        #configuration
        self.punctuation = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'
        self.filter_occurrence_threshold = 2
        self.sample_rate = 10

    def connect_db(self):
        self.db = MySQLdb.connect(host="localhost",
                                  user="******",
                                  passwd="!ac-okl.34.731",
                                  db="king")
        self.cursor = self.db.cursor(cursorclass=MySQLdb.cursors.DictCursor)

    def run(self):
        #gather product corpus per category
        category_ids = self.get_category_ids_with_many_products()

        for category_id in category_ids:
            products = self.get_products_by_category_id(category_id)
            pc = 0
            for product in products:
                pc += 1

                if pc % self.sample_rate != 0:
                    continue

                print pc
                self.process_product(product)

        print 'product count: ' + str(pc)

        self.nouns = [
            k for k, v in self.noun_dict.iteritems()
            if v > self.filter_occurrence_threshold
        ]
        self.adjectives = [
            k for k, v in self.adj_dict.iteritems()
            if v > self.filter_occurrence_threshold
        ]

        print 'nouns'
        print len(self.nouns)
        print self.nouns

        print 'adjectives'
        print len(self.adjectives)
        print self.adjectives

    def process_product(self, product):
        #1
        sentence = self.clean_product_string(product)

        #2
        words = self.tokenize_sentence(sentence)

        #3
        nouns, adjectives = self.extract_nouns_and_adjectives(words)

        for noun in nouns:
            self.noun_dict[noun] = self.noun_dict.get(noun, 0) + 1

        for adjective in adjectives:
            self.adj_dict[adjective] = self.adj_dict.get(adjective, 0) + 1

    #processing the corpus
    def clean_product_string(self, product):
        print 'clean_product_corpus:'
        sentence = product['name']  #+ '\n' + product['description']

        #strip lines
        #sentence = " ".join(sentence.splitlines())
        #strip html

        sentence = nltk.clean_html(sentence)

        #sentence = " ".join([str(s) for s in BeautifulSoup(sentence).findAll(text=True)])

        #strip punctuations (translate is fastest)
        sentence = sentence.translate(None, self.punctuation)

        #TODO:filter more words

        return sentence

    def tokenize_sentence(self, sentence):
        print sentence
        print '-->'
        #tokenize, lower case, remove stop words, lemmatize
        words = [
            self.lemmatizer.lemmatize(w.lower())
            for w in nltk.word_tokenize(sentence)
            if w not in stopwords.words('english')
        ]
        #words = [self.stemmer.stem(w) for w in words]
        return words

    def extract_nouns_and_adjectives(self, words):
        tags = nltk.pos_tag(words)
        print tags

        nouns = set([t[0] for t in tags if t[1] in ['NN']])
        adjectives = set([t[0] for t in tags if t[1] in ['JJ']])
        return (nouns, adjectives)

    #this return the categories with more than 10000 products
    def get_category_ids_with_many_products(self):
        print 'get_category_ids_with_many_products'
        return [120401]
        return [
            100101, 100402, 109999, 110102, 110106, 110804, 119999, 120203,
            120401, 130102, 130804, 131503, 139999, 150901, 160501, 170301,
            170401, 179999, 190301, 200103
        ]

        query = """SELECT cd.category_id, cd.name, cd.description, count(p2c.product_id) as product_count
                FROM category c
                JOIN category_description cd on c.category_id = cd.category_id,
                product_to_category p2c
                WHERE c.category_id = p2c.category_id
                GROUP BY 1
                HAVING count(p2c.product_id) > 10000"""
        self.cursor.execute(query)
        results = self.cursor.fetchall()

        category_tuples = [(row['category_id'], row['name'])
                           for row in results]
        return category_tuples

    def get_products_by_category_id(self, category_id):
        print 'get_products_by_category_id:' + str(category_id)
        query = """SELECT pd.product_id, pd.name, pd.description
                FROM product_to_category p2c, product_description pd
                WHERE p2c.product_id = pd.product_id
                AND p2c.category_id = %s;"""
        self.cursor.execute(query, category_id)
        products = self.cursor.fetchall()
        print 'count:' + str(len(products))
        return products

    def end(self):
        self.db.close()
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
            if sys.version_info < (3,):
                f = open(fpath)
            else:
                f = open(fpath, encoding='latin-1')
            t = f.read()
            t = t.replace('\n', '')
            t = re.sub(r'[^\w\s]','',t)
            tokens = word_tokenize(t)
            #filtered_tokens = [word for word in tokens if word not in stopwords.words('english')]
            lmtzr = WordNetLemmatizer()
            lems = [lmtzr.lemmatize(t) for t in tokens]
            t = " ".join(lems)
            texts.append(t)
            f.close()
            labels.append(label_id)

print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
with open("word_index.pkl", "wb") as w_index_file:
    pickle.dump(word_index, w_index_file)
Example #38
def Lemmatization(word):
    return WordNetLemmatizer().lemmatize(word, "v")  # lemmatize() must be called on an instance
Example #39
def createKeyWords(log_directory, channel_name, output_directory, startingDate,
                   startingMonth, endingDate, endingMonth):
    """ outputs the keywords for each user on a particular channel
	after normalising the frequency and removing the common stop words.

    Args:
        log_directory (str): Location of the logs (Assumed to be arranged in directory structure as : <year>/<month>/<day>/<log-file-for-channel>.txt)
        channel_name (str): Channel to be perform analysis on
        output_directory (str): Location of output directory
        startingDate (int): Date to start the analysis (in conjunction with startingMonth)
        startingMonth (int): Date to start the analysis (in conjunction with startingDate)
        endingDate (int): Date to end the analysis (in conjunction with endingMonth)
        endingMonth (int): Date to end the analysis (in conjunction with endingDate)

    Returns:
       null 

    """

    out_dir_nick_change = output_directory + "key-words/"
    user_words_dict = []
    user_keyword_freq_dict = []
    nick_same_list = [
        [] for i in range(5000)
    ]  #list of list with each list having all the nicks for that particular person
    keywords_filtered = []
    no_messages = 0

    # print "Creating a new output folder"
    # os.system("rm -rf "+out_dir_nick_change)
    # os.system("mkdir "+out_dir_nick_change)

    rem_time = None  #remembers the time of the last message of the file parsed before the current file

    for folderiterator in range(startingMonth, endingMonth + 1):
        temp1 = "0" if folderiterator < 10 else ""
        for fileiterator in range(
                startingDate if folderiterator == startingMonth else 1,
                endingDate + 1 if folderiterator == endingMonth else 32):
            temp2 = "0" if fileiterator < 10 else ""
            filePath = log_directory + temp1 + str(
                folderiterator) + "/" + temp2 + str(
                    fileiterator) + "/" + channel_name + ".txt"
            if not os.path.exists(filePath):
                if not ((folderiterator == 2 and
                         (fileiterator == 29 or fileiterator == 30
                          or fileiterator == 31)) or
                        ((folderiterator == 4 or folderiterator == 6
                          or folderiterator == 9 or folderiterator == 11)
                         and fileiterator == 31)):
                    print "[Error] Path " + filePath + " doesn't exist"
                continue
            with open(filePath) as f:
                content = f.readlines(
                )  #contents stores all the lines of the file channel_name

            # print "Analysing ",filePath

            nicks = []  #list of all the nicknames
            '''
				Getting all the nicknames in a list nicks[]
			'''
            for i in content:
                if (i[0] != '=' and "] <" in i and "> " in i):
                    m = re.search(r"\<(.*?)\>", i)
                    if m.group(0) not in nicks:
                        nicks.append(
                            m.group(0)
                        )  #used regex to get the string between <> and appended it to the nicks list

            for i in xrange(0, len(nicks)):
                nicks[i] = nicks[i][1:-1]  #removed <> from the nicknames

            for i in xrange(0, len(nicks)):
                nicks[i] = ext.util.correctLastCharCR(nicks[i])

            for line in content:
                if (
                        line[0] == '=' and "changed the topic of" not in line
                ):  #excluding the condition when user changes the topic. Search for only nick changes
                    nick1 = ext.util.correctLastCharCR(
                        line[line.find("=") + 1:line.find(" is")][3:])
                    nick2 = ext.util.correctLastCharCR(
                        line[line.find("wn as") + 1:line.find("\n")][5:])
                    if nick1 not in nicks:
                        nicks.append(nick1)
                    if nick2 not in nicks:
                        nicks.append(nick2)

            #print("printing nicks***********************************")
            #print(nicks)
            '''
				Forming a list of lists to avoid nickname duplication
			'''

            for line in content:
                if (line[0] == '=' and "changed the topic of" not in line):
                    line1 = line[line.find("=") + 1:line.find(" is")][3:]
                    line2 = line[line.find("wn as") + 1:line.find("\n")][5:]
                    line1 = ext.util.correctLastCharCR(line1)
                    line2 = ext.util.correctLastCharCR(line2)
                    for i in range(5000):
                        if line1 in nick_same_list[
                                i] or line2 in nick_same_list[i]:
                            nick_same_list[i].append(line1)
                            nick_same_list[i].append(line2)
                            break
                        if not nick_same_list[i]:
                            nick_same_list[i].append(line1)
                            nick_same_list[i].append(line2)
                            break
            #print("printing nick_same_list****************************")
            #print(nick_same_list)
            for line in content:
                flag_comma = 0
                if (line[0] != '=' and "] <" in line and "> " in line):
                    m = re.search(r"\<(.*?)\>", line)
                    var = m.group(0)[1:-1]
                    var = ext.util.correctLastCharCR(var)
                    for d in range(len(nicks)):
                        if var in nick_same_list[d]:
                            nick_sender = nick_same_list[d][0]
                            break
                        else:
                            nick_sender = var

                    nick_receiver = ''
                    for i in nicks:
                        rec_list = [e.strip() for e in line.split(':')
                                    ]  #receiver list split on ':'
                        rec_list[1] = rec_list[1][rec_list[1].find(">") +
                                                  1:len(rec_list[1])]
                        rec_list[1] = rec_list[1][1:]
                        if not rec_list[1]:  #index 0 will contain time 14:02
                            break
                        for k in xrange(0, len(rec_list)):
                            if (rec_list[k]):  #checking for \
                                rec_list[k] = ext.util.correctLastCharCR(
                                    rec_list[k])
                        for z in rec_list:
                            if (z == i):
                                if (var != i):
                                    for d in range(len(nicks)):
                                        if i in nick_same_list[d]:
                                            nick_receiver = nick_same_list[d][
                                                0]
                                            break
                                        else:
                                            nick_receiver = i

                        if "," in rec_list[
                                1]:  #receiver list may of the form <Dhruv> Rohan, Ram :
                            flag_comma = 1
                            rec_list_2 = [
                                e.strip() for e in rec_list[1].split(',')
                            ]
                            for y in xrange(0, len(rec_list_2)):
                                if (rec_list_2[y]):  #checking for \
                                    rec_list_2[y] = ext.util.correctLastCharCR(
                                        rec_list_2[y])
                            for j in rec_list_2:
                                if (j == i):
                                    if (var != i):
                                        for d in range(len(nicks)):
                                            if i in nick_same_list[d]:
                                                nick_receiver = nick_same_list[
                                                    d][0]
                                                break
                                            else:
                                                nick_receiver = i

                        if (flag_comma == 0
                            ):  #receiver list can be <Dhruv> Rohan, Hi!
                            rec = line[line.find(">") + 1:line.find(", ")]
                            rec = rec[1:]
                            rec = ext.util.correctLastCharCR(rec)
                            if (rec == i):
                                if (var != i):
                                    for d in range(len(nicks)):
                                        if i in nick_same_list[d]:
                                            nick_receiver = nick_same_list[d][
                                                0]
                                            break
                                        else:
                                            nick_receiver = i

                    #generating the words written by the sender
                    message = rec_list[1:]
                    no_messages += 1
                    correctedNickReciever = correctNickFor_(nick_receiver)
                    if correctedNickReciever in message:
                        message.remove(correctedNickReciever)
                    # print nick_sender, "Message", ":".join(message), "end"

                    lmtzr = WordNetLemmatizer()
                    #limit word size = 3, drop numbers.
                    word_list_temp = re.sub(
                        r'\d+', '', " ".join(
                            re.findall(r'\w{3,}', ":".join(message).replace(
                                ",", " ")))).split(" ")
                    word_list = []
                    #remove punctuations
                    for word in word_list_temp:
                        word = word.lower()
                        word_list.append(word.replace("'", ""))
                    word_list_lemmatized = []
                    try:
                        word_list_lemmatized = map(
                            lmtzr.lemmatize,
                            map(lambda x: lmtzr.lemmatize(x, 'v'), word_list))
                    except UnicodeDecodeError:
                        pass
                    # word_list_lemmatized = [ unicode(s) for s in word_list_lemmatized]
                    # print "=====>original", word_list
                    # print "===>lemmatized", word_list_lemmatized

                    fr = 1
                    for dic in user_words_dict:
                        if dic['sender'] == nick_sender:
                            # print '1========',word_list_lemmatized
                            dic['words'].extend(word_list_lemmatized)
                            fr = 0
                    if fr:
                        # print '2========',word_list_lemmatized
                        user_words_dict.append({
                            'sender': nick_sender,
                            'words': word_list_lemmatized
                        })

    nicks_for_stop_words = []
    stop_word_without_apostrophe = []

    for l in nick_same_list:
        nicks_for_stop_words.extend(l)

    for dictonary in user_words_dict:
        nicks_for_stop_words.append(dictonary['sender'])

    nicks_for_stop_words.extend([x.lower() for x in nicks_for_stop_words])

    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'", ""))

    stop_words_extended = text.ENGLISH_STOP_WORDS.union(
        common_english_words.words).union(nicks_for_stop_words).union(
            stop_word_without_apostrophe).union(custom_stop_words.words).union(
                custom_stop_words.slangs)
    count_vect = CountVectorizer(analyzer='word',
                                 stop_words=stop_words_extended,
                                 min_df=1)

    for dictonary in user_words_dict:
        # print dictonary['sender']
        # print dictonary['words']
        try:
            matrix = count_vect.fit_transform(dictonary['words'])
            freqs = [[word, matrix.getcol(idx).sum()]
                     for word, idx in count_vect.vocabulary_.items()]
            keywords = sorted(freqs, key=lambda x: -x[1])
            # print 'Nick:', dictonary['sender']
            total_freq = 0.0
            for freq_tuple in keywords:
                total_freq += freq_tuple[1]
            # print total_freq

            for freq_tuple in keywords:
                freq_tuple.append(round(freq_tuple[1] / float(total_freq), 5))
            user_keyword_freq_dict.append({
                'nick': dictonary['sender'],
                'keywords': keywords
            })

            # print 'Keywords: (Format : [<word>, <frequency>, <normalised_score>])'
            # print keywords
            # print "\n"
        except ValueError:
            pass

    # print user_keyword_freq_dict
    # print dataForNick(user_keyword_freq_dict, 'BluesKaj', 0.01)
    for data in user_keyword_freq_dict:
        keywords, normal_scores = dataForNick(user_keyword_freq_dict,
                                              data['nick'], 0.01, 100)
        # print "Nick:", data['nick']
        # print "Keywords with normalised score > 0.01\n", keywords
        # print "Their Normal scores\n", normal_scores
        # print "\n"
        if keywords:
            keywords_filtered.append({
                'nick': data['nick'],
                'keywords': keywords
            })

    # print "KEYWORDS!"
    # print keywords_filtered
    # print "DICT"
    # print user_keyword_freq_dict
    print str(startingMonth) + "\t" + str(no_messages) + "\t" + str(
        len(user_words_dict))
    return keywords_filtered, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words
Ejemplo n.º 40
0
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud
from textblob import TextBlob

data_df['tokenized_tweet'] = data_df['clean_text'].apply(
    lambda x: word_tokenize(x))
stop_words = set(stopwords.words('english'))
data_df['tweet_token_filter'] = data_df['tokenized_tweet'].apply(
    lambda x: [word for word in x if not word in stop_words])
lemmatizing = WordNetLemmatizer()
data_df['tweet_lemmatized'] = data_df['tweet_token_filter'].apply(
    lambda x: ' '.join([lemmatizing.lemmatize(i) for i in x]))
data_df['sentiment_lemmatized'] = data_df['tweet_lemmatized'].apply(
    lambda x: TextBlob(x).sentiment)
data_df[['sentiment_lemmatized', 'tweet_lemmatized']].head(10)

all_words = ' '.join([text for text in data_df['tweet_lemmatized']])
wordcloud = WordCloud(width=800,
                      height=500,
                      random_state=21,
                      max_font_size=110).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title("Most Common words in column Tweet Lemmatized")
plt.show()
Ejemplo n.º 41
0
def keywords(log_dict, nicks, nick_same_list):
    """
    Returns keywords for all users.

    Args:
        log_dict (dict): dictionary of log data created using reader.py
        nicks (list): list of nicknames created using nickTracker.py
        nick_same_list (list): list of lists grouping nicks that belong to the same user, created using nickTracker.py

    Returns:
        keywords_filtered: filtered keywords for each user
        user_keyword_freq_dict: dictionary for each user with keywords and their frequencies
        user_words_dict: keywords for each user
        nicks_for_stop_words: nicks added to the stop-word list
    """

    user_words_dict = []
    user_keyword_freq_dict = []
    keywords_filtered = []
    no_messages = 0

    def get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name,
                          nicks, nick_same_list):
        if (rec == nick_name):
            if (nick_to_compare != nick_name):
                nick_receiver = util.get_nick_representative(
                    nicks, nick_same_list, nick_name)
        return nick_receiver

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            for line in day_log:
                flag_comma = 0
                if (util.check_if_msg_line(line)):
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_compare = util.correctLastCharCR(
                        (m.group(0)[1:-1]))
                    nick_sender = ''
                    nick_sender = util.get_nick_representative(
                        nicks, nick_same_list, nick_to_compare)

                    nick_receiver = ''
                    for nick_name in nicks:
                        rec_list = [e.strip() for e in line.split(':')
                                    ]  #receiver list split on ':'
                        rec_list = util.rec_list_splice(rec_list)

                        if not rec_list[1]:  #index 0 will contain time 14:02
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for rec in rec_list:
                            nick_receiver = get_nick_receiver(
                                nick_receiver, rec, nick_to_compare, nick_name,
                                nicks, nick_same_list)

                        if "," in rec_list[
                                1]:  #receiver list may of the form <Dhruv> Rohan, Ram :
                            flag_comma = 1
                            rec_list_2 = [
                                e.strip() for e in rec_list[1].split(',')
                            ]
                            rec_list_2 = util.correct_last_char_list(
                                rec_list_2)
                            for rec in rec_list_2:
                                nick_receiver = get_nick_receiver(
                                    nick_receiver, rec, nick_to_compare,
                                    nick_name, nicks, nick_same_list)

                        if (flag_comma == 0
                            ):  #receiver list can be <Dhruv> Rohan, Hi!
                            rec = util.splice_find(line, ">", ", ", 1)
                            nick_receiver = get_nick_receiver(
                                nick_receiver, rec, nick_to_compare, nick_name,
                                nicks, nick_same_list)

                    #generating the words written by the sender
                    message = rec_list[1:]
                    no_messages += 1
                    correctedNickReciever = util.correct_nick_for_(
                        nick_receiver)
                    if correctedNickReciever in message:
                        message.remove(correctedNickReciever)

                    lmtzr = WordNetLemmatizer()

                    #limit word size = 3, drop numbers.
                    word_list_temp = re.sub(
                        r'\d+', '', " ".join(
                            re.findall(r'\w{3,}', ":".join(message).replace(
                                ",", " ")))).split(" ")
                    word_list = []

                    #remove punctuations
                    for word in word_list_temp:
                        word = word.lower()
                        word_list.append(word.replace("'", ""))
                    word_list_lemmatized = []

                    try:
                        word_list_lemmatized = map(
                            lmtzr.lemmatize,
                            map(lambda x: lmtzr.lemmatize(x, 'v'), word_list))
                    except UnicodeDecodeError:
                        pass

                    fr = 1
                    for dic in user_words_dict:
                        if dic['sender'] == nick_sender:
                            dic['words'].extend(word_list_lemmatized)
                            fr = 0
                    if fr:
                        user_words_dict.append({
                            'sender': nick_sender,
                            'words': word_list_lemmatized
                        })

    nicks_for_stop_words = []
    stop_word_without_apostrophe = []

    for l in nick_same_list:
        nicks_for_stop_words.extend(l)

    for dictonary in user_words_dict:
        nicks_for_stop_words.append(dictonary['sender'])

    nicks_for_stop_words.extend([x.lower() for x in nicks_for_stop_words])

    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'", ""))

    stop_words_extended = extended_stop_words(nicks_for_stop_words,
                                              stop_word_without_apostrophe)

    count_vect = CountVectorizer(analyzer='word',
                                 stop_words=stop_words_extended,
                                 min_df=1)
    keywords_for_channels = []
    for dictonary in user_words_dict:
        try:
            matrix = count_vect.fit_transform(dictonary['words'])
            freqs = [[word, matrix.getcol(idx).sum()]
                     for word, idx in count_vect.vocabulary_.items()]
            keywords = sorted(freqs, key=lambda x: -x[1])
            total_freq = 0.0
            for freq_tuple in keywords:
                total_freq += freq_tuple[1]

            for freq_tuple in keywords:
                freq_tuple.append(round(freq_tuple[1] / float(total_freq), 5))
            user_keyword_freq_dict.append({
                'nick': dictonary['sender'],
                'keywords': keywords
            })
            keywords_for_channels.extend(keywords)
        except ValueError:
            pass
    for data in user_keyword_freq_dict:
        keywords, normal_scores = top_keywords_for_nick(
            user_keyword_freq_dict, data['nick'], config.KEYWORDS_THRESHOLD,
            config.KEYWORDS_MIN_WORDS)
        if config.DEBUGGER and config.PRINT_WORDS:
            print "Nick:", data['nick']
            print "Keywords with normalised score > 0.01\n", keywords
            print "Their Normal scores\n", normal_scores
            print "\n"
        if keywords:
            keywords_filtered.append({
                'nick': data['nick'],
                'keywords': keywords
            })

    return keywords_filtered, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words, sorted(
        keywords_for_channels, key=lambda x: x[2], reverse=True)
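Both keyword routines lemmatize every word twice: first as a verb, then with the default noun POS. A hedged sketch of the same idea written for Python 3, where map() is lazy and a list comprehension is clearer:

from nltk.stem.wordnet import WordNetLemmatizer

lmtzr = WordNetLemmatizer()
word_list = ["studies", "running", "better"]
# verb pass first, then the default (noun) pass
word_list_lemmatized = [lmtzr.lemmatize(lmtzr.lemmatize(w, 'v'), 'n')
                        for w in word_list]
print(word_list_lemmatized)  # e.g. ['study', 'run', 'better']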
Ejemplo n.º 42
0
pt = PorterStemmer()

from nltk.stem.snowball import EnglishStemmer

sb = EnglishStemmer()

from nltk.stem.wordnet import WordNetLemmatizer

wn = WordNetLemmatizer()

##let's examine the word "better"
st.stem('better')
pt.stem('better')
sb.stem('better')
wn.lemmatize('better', 'a')

wn.lemmatize('families', 'n')
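## for reference: the Porter and Snowball stemmers leave 'better' unchanged,
## while the WordNet lemmatizer maps 'better' (pos='a') to 'good' and
## 'families' to 'family'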

##
##applying the porter stemmer to the gettysburg address

text_5 = list(map(pt.stem, text_3))

##now creating a dictionary that will count the occurrence of the words

getty = {}
used = []
for word in text_5:
    if word in getty:
        getty[word] += 1
Ejemplo n.º 43
0
    filtered_sent = []
    for w in tokenized_sent:
        # keep only tokens that are not stop words
        # (stop_words is assumed to be defined earlier in the original script)
        if w not in stop_words:
            filtered_sent.append(w)
    print("Tokenized Sentence:", tokenized_sent)
    print("Filterd Sentence:", filtered_sent)

    ps = PorterStemmer()
    stemmed_words = []
    for w in filtered_sent:
        stemmed_words.append(ps.stem(w))

    print("Filtered Sentence:", filtered_sent)
    print("Stemmed Sentence:", stemmed_words)

    lem = WordNetLemmatizer()
    stem = PorterStemmer()
    word = "flying"
    print("Lemmatized Word:", lem.lemmatize(word, "v"))
    print("Stemmed Word:", stem.stem(word))

    text = """Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome.
    The sky is pinkish-blue. You shouldn't eat cardboard"""
    print_text_sentence_tokenization(text)

    sent = "Albert Einstein was born in Ulm, Germany in 1879."
    tokens = nltk.word_tokenize(sent)
    print(tokens)
    print(nltk.pos_tag(tokens))
Ejemplo n.º 44
0
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
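A brief usage sketch for the tokenizer above: scikit-learn vectorizers accept a custom tokenizer callable, so the class can be plugged in directly (the imports restate what the surrounding file is assumed to provide):

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

vect = CountVectorizer(tokenizer=LemmaTokenizer())
X = vect.fit_transform(["The cats are chasing mice", "A cat chased a mouse"])
print(sorted(vect.vocabulary_))  # lemmatized vocabulary, e.g. containing 'cat' and 'mouse'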
Ejemplo n.º 45
0
if __name__ == '__main__':
    driver = QABase()

    # Get the first question and its story
    q = driver.get_question("fables-01-1")
    story = driver.get_story(q["sid"])

    # get the dependency graph of the first question
    qgraph = q["dep"]
    #print("qgraph:", qgraph)

    # The answer is in the second sentence
    # You would have to figure this out like in the chunking demo
    sgraph = story["sch_dep"][1]

    # TODO: send in the correct sentence!!!!!

    lmtzr = WordNetLemmatizer()
    for node in sgraph.nodes.values():
        tag = node["tag"]
        word = node["word"]
        if word is not None:
            if tag.startswith("V"):
                print(lmtzr.lemmatize(word, 'v'))
            else:
                print(lmtzr.lemmatize(word, 'n'))
    print()

    answer = find_answer(qgraph, sgraph)
    print("answer:", answer)
Ejemplo n.º 46
0
def textProcessing(dataset):
    tokens = []
    sentencesTokenize = sent_tokenize(dataset)
    for item in sentencesTokenize:
        tokens.append(word_tokenize(item.lower()))
    sentences = copy.deepcopy(tokens)

    englishStopwords = stopwords.words('english')
    punctuations = list(string.punctuation)

    for i in range(len(tokens)):
        tokens[i] = ' '.join(tokens[i])
        tokens[i] = re.sub(r"i'm", "i am", tokens[i])
        tokens[i] = re.sub(r"n't", "not", tokens[i])
        tokens[i] = re.sub(r"n 't", "not", tokens[i])
        tokens[i] = re.sub(r"n' t", "not", tokens[i])
        tokens[i] = re.sub(r"he's", "he is", tokens[i])
        tokens[i] = re.sub(r"she's", "she is", tokens[i])
        tokens[i] = re.sub(r"that's", "that is", tokens[i])
        tokens[i] = re.sub(r"what's", "what is", tokens[i])
        tokens[i] = re.sub(r"where's", "where is", tokens[i])
        tokens[i] = re.sub(r"\'ll", " will", tokens[i])
        tokens[i] = re.sub(r"\'ve", " have", tokens[i])
        tokens[i] = re.sub(r"\'re", " are", tokens[i])
        tokens[i] = re.sub(r"\'d", " would", tokens[i])
        tokens[i] = re.sub(r"won't", "will not", tokens[i])
        tokens[i] = re.sub(r"can't", "cannot", tokens[i])
        tokens[i] = re.sub(r"don't", "do not", tokens[i])

        tokens[i] = "".join(ch for ch in tokens[i] if ch not in punctuations)

    Tokens = []
    for item in tokens:
        Tokens.append(word_tokenize(item.lower()))

    for i in range(len(Tokens)):
        Tokens[i] = [
            value for value in Tokens[i] if value not in englishStopwords
        ]

    pos_tags = {
        NOUN: ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'WP', 'WP$'],
        VERB: ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
        ADJ: ['JJ', 'JJR', 'JJS'],
        ADV: ['RB', 'RBR', 'RBS', 'WRB']
    }

    tagged_words = []
    for token in Tokens:
        tagged_words.append(nltk.pos_tag(token))

    pos_word = []
    pos_words = []

    for i in range(len(tagged_words)):
        pos_word = []
        for j in range(len(tagged_words[i])):

            flag = False
            #         pos_words.append([])
            for key, value in pos_tags.items():
                if tagged_words[i][j][1] in value:
                    pos_word.append((tagged_words[i][j], key))
                    flag = True
                    break
            # default to NOUN when the tag does not fall into any POS group
            if not flag:
                pos_word.append((tagged_words[i][j], NOUN))
        pos_words.append(pos_word)

    normalized_words = []
    lem = WordNetLemmatizer()
    for i in range(len(pos_words)):
        normalized_words.append(
            [lem.lemmatize(w[0], pos=p) for w, p in pos_words[i]])

    return normalized_words, sentences
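A possible usage sketch for textProcessing (assuming NOUN, VERB, ADJ and ADV are the WordNet POS constants 'n', 'v', 'a' and 'r', and that the NLTK tokenizers, stopwords and tagger are available):

normalized, sentences = textProcessing("The cats were chasing the mice. Dogs barked loudly.")
print(normalized)  # lemmas per sentence, roughly [['cat', 'chase', 'mouse'], ['dog', 'bark', 'loudly']]
print(sentences)   # the original lowercased tokens, one list per sentence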
Ejemplo n.º 47
0
en_stop = en_stop + customList6 + customList7 + customList8 + customList9 + customList10

p_stemmer = PorterStemmer()
lmtzr = WordNetLemmatizer()

doc_set = ubuntu_rss_list

texts = []

# Generate topics using LDA
for i in doc_set:

    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [i for i in tokens if not i in en_stop]
    stemmed_tokens = [lmtzr.lemmatize(i) for i in stopped_tokens]
    texts.append(stemmed_tokens)

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                           num_topics=10,
                                           id2word=dictionary,
                                           passes=5)

for i in ldamodel.show_topics():
    print(i[0], i[1])

end_time = time()
time_taken = end_time - start_time
Ejemplo n.º 48
0
# create lemmatizer
lem = WordNetLemmatizer()

#stopword
stop_words = set(stopwords.words('english'))

df = pd.read_csv('textm_wine_reviews.csv', nrows=500, delimiter=',')
label_data = pd.DataFrame(df, columns=['winery','province','country'])
label = list(map(lambda x: ', '.join(x), label_data.values))

input_search = [
  ' memory of a wine once made by his mother'
]

list_document = list()
vectorizer = CountVectorizer()

for i in range(0, len(df)):
  tokenized_word=word_tokenize(df.loc[i][2])
  filter_words = [w for w in tokenized_word if not w in stop_words]
  stemmed_word = list(map(lambda x: stemmer.stem(x), filter_words))
  lemmatize_word = list(map(lambda x : lem.lemmatize(x, 'v'), stemmed_word))
  output = ' '.join(lemmatize_word)
  X = vectorizer.fit_transform([output])
  print(X.toarray())
  Y = vectorizer.transform(input_search)
  list_document.append([label[i], sum(Y.data)])

pprint.pprint(sorted(list_document, key = lambda x: x[1], reverse = True)[0:7])
end = time.time()
print('exe time Count Vectorizer: ', end-start)
Ejemplo n.º 49
0
# Original paper: http://web.simmons.edu/~benoit/lis466/PorterStemmingAlgorithm.pdf
porter = PorterStemmer()
print(types)
print([porter.stem(x) for x in types])

print(porter.stem('city'))

types = [
    'bed', 'kiss', 'tied', 'tis', 'universal', 'university', 'experiment',
    'experience', 'past', 'paste', 'alumnus', 'alumni', 'adhere', 'adhesion',
    'create', 'creation'
]
porter_results = [porter.stem(x) for x in types]
print(porter_results)

from nltk.stem.wordnet import WordNetLemmatizer
# See description: https://wordnet.princeton.edu/wordnet/man/morphy.7WN.html
lemm = WordNetLemmatizer()
lemm_results = [lemm.lemmatize(x) for x in types]
print('%15s\t%15s\t%15s' % ('type', 'porter', 'lemmatizer'))
print('\n')
print('\n'.join([
    '%15s\t%15s\t%15s' % (t[0], t[1], t[2])
    for t in zip(types, porter_results, lemm_results)
]))

print(lemm.lemmatize('are'))
print(lemm.lemmatize('is'))

print(lemm.lemmatize('are', 'v'))
print(lemm.lemmatize('is', 'v'))
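# without an explicit POS the lemmatizer defaults to nouns, so 'are' and 'is'
# come back unchanged; with pos='v' both are reduced to their lemma 'be'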
Ejemplo n.º 50
0
DATA_DIR = "MachineLearningGroupProject/data/"
data = load_files(DATA_DIR, encoding='utf-8', decode_error='replace')
labels, counts = np.unique(data.target, return_counts=True)
labels_str = np.array(data.target_names)[labels]
print(dict(zip(labels_str, counts)))

#Tokenise and lemmatise the text data
nltk.download('wordnet')
lemmatiser = WordNetLemmatizer()
tokeniser = CountVectorizer().build_tokenizer()
for i in range(0, len(data.data)):
    temp_str = " "
    data.data[i] = data.data[i].lower()
    data.data[i] = tokeniser(data.data[i])
    for token in range(0, len(data.data[i])):
        data.data[i][token] = lemmatiser.lemmatize(data.data[i][token])
    data.data[i] = temp_str.join(data.data[i])


X_train, X_test, Y_train, Y_test = train_test_split(data.data, data.target)
# print(X_test);
vectoriser = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
x_train_counts = vectoriser.fit_transform(X_train)
tf_transformer = TfidfTransformer(use_idf=False).fit(x_train_counts)
x_train_tf = tf_transformer.transform(x_train_counts)

model = LinearSVC()
model.fit(x_train_tf, Y_train)

y_pred = model.predict(vectoriser.transform(X_test))
print(accuracy_score(Y_test, y_pred))
Ejemplo n.º 51
0
def lemmatize_verb(verbs):
    lemmatizer = WordNetLemmatizer()
    return map(lambda x:lemmatizer.lemmatize(x, 'v'),verbs)
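# usage note: in Python 3 map() returns a lazy iterator, so a caller would
# typically wrap the result, e.g. list(lemmatize_verb(['running', 'went']))
# which yields ['run', 'go']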
Ejemplo n.º 52
0
		# List of valid lemmas included in current query
		# query        : Project Gutenberg Literacy Archive Foundation
		# query_lemmas : project gutenberg archive foundation
		query_lemmas = []

		for word, pos in pos_tag(wp_tokenizer.tokenize(query.lower().strip())):
			# Sanitize the query the same way the documents were sanitized when the index was built: stem/lemmatize the words, lowercase everything, remove punctuation and apply the same analysis used while building the index.
			if(
				pos in CLOSED_TAGS or						# search the closed tag set O(1)
				pattern.search(word) or						# If includes a non-letter character
				word in stop_words							# search for stop words O(1)
			):
				continue

			pos = 'v' if (pos.startswith('VB')) else 'n'	# If current term's appearance is verb related then the POS lemmatizer should be verb ('v'), otherwise ('n')
			if (word in inverted_file):
				query_lemmas.append(wnl_lemmatizer.lemmatize(word, pos))		# Stemming/Lemmatization

		if (len(query_lemmas) < 1):
			print "Querying: No relevant document!"
			continue

		# Standard query: After sanitizing/wrangling the input query we retrieve the inverted list of the remaining terms/lemmas and which we aggregate and union them.
		standard_query(query_lemmas)

		# Phrase query: after sanitizing/wrangling the input query we run a single-word query for every lemma found and add each of these results to our total list. 'common_documents' is the set of documents that contain all the words in the query.
		# Then we check them for ordering: for every list in the intermediate results, we first build a list of lists with the positions of each word of the input query, then use two nested for loops to iterate over it and keep only the documents where the words appear in the proper order.
		phrase_query(query_lemmas)

	sys.exit(0)
Ejemplo n.º 53
0
 


def _remove_regex(input_text, regex_pattern):
		urls = re.finditer(regex_pattern, input_text)
		for i in urls:
				input_text = re.sub(i.group().strip(), '', input_text)
		return input_text
regex_pattern = "#[\w]*" 
_remove_regex("remove this #hashtag from analytics vidhya", regex_pattern)


lem 	= WordNetLemmatizer()
stem 	= PorterStemmer()
word 	= "multiplying" 
lem.lemmatize(word, "v")
stem.stem(word)


lookup_dict = {'rt':'Retweet', 'dm':'direct message', "awsm" : "awesome", "luv" :"love"}
def _lookup_words(input_text):
		words = input_text.split()
		new_words = []
		for word in words:
				if word.lower() in lookup_dict:
						word = lookup_dict[word.lower()]
				new_words.append(word)
		# join and return after the loop so that every word gets processed
		new_text = " ".join(new_words)
		return new_text
_lookup_words("RT this is a retweeted tweet by Shivam Bansal")
Ejemplo n.º 54
0
    text = text.lower()

    # remove tags
    text = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", text)

    # Remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)

    ## Convert to list from string
    text = text.split()

    ## Stemming
    ps = PorterStemmer()
    # Lemmatisation
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if not word in stop_words]
    text = " ".join(text)
    corpus.append(text)

#Word cloud
wordcloud = WordCloud(background_color='white',
                      stopwords=stop_words,
                      max_words=100,
                      max_font_size=50,
                      random_state=42).generate(str(corpus))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig("word2.jpg", dpi=900)
Ejemplo n.º 55
0
    def lemmatize(self, tokens):
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(token) for token in tokens]
Ejemplo n.º 56
0
from nltk.corpus import wordnet, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import pickle
from collections import defaultdict
import pdb

stopWords = set(stopwords.words("english"))
lmtzr = WordNetLemmatizer()

en_vocab_file = open("../created_datas/en.vocab")
en_vocab_with_senses = {}
en_word_senses = defaultdict(
    list)  #to keep all senses of a word in a dictionary

for line in en_vocab_file:
    word = lmtzr.lemmatize(line.strip().lower())
    senses = wordnet.synsets(line.strip())

    if senses:
        for sense in senses:
            en_word_senses[word].append(sense.name())
            en_vocab_with_senses[sense.name()] = []

with open("../created_datas/en_vocab_with_senses.pkl", "wb") as fw:
    pickle.dump(en_vocab_with_senses, fw)

with open("../created_datas/en_words_wsynsets.pkl", "wb") as ws:
    pickle.dump(en_word_senses, ws)
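# for reference, en_word_senses ends up mapping each lemma to its WordNet
# synset names, e.g. roughly {'dog': ['dog.n.01', 'frump.n.01', ...], ...},
# while en_vocab_with_senses holds those synset names as (initially empty) keys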
Ejemplo n.º 57
0
# number removal

spotify_file['Review'] = spotify_file['Review'].str.replace('\d+', '')
spotify_file['Review'].head()


# Lemmatization

import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

spotify_file['Review']= spotify_file['Review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word,'n')for word in x.split()]))
spotify_file['Review']= spotify_file['Review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word,'v')for word in x.split()]))
print(spotify_file['Review'])

# spelling correction
from autocorrect import spell 
spotify_file['Review']= spotify_file['Review'].apply(lambda x: " ".join([spell(i) for i in x.split()]))

#replace words (depends on how word changes)

spotify_file.Review = spotify_file.Review.str.replace('app', 'application')
spotify_file.Review = spotify_file.Review.str.replace('specify', 'spotify')


# Tokenization
Ejemplo n.º 58
0
f.write(obj)
word_tokens = list(word_tokenize(obj))



# filtered_sentence = [w for w in word_tokens if not w in stop_words]



filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

t =[ lem.lemmatize(i.lower(), pos='v') for i in filtered_sentence]
# t=filtered_sentence
# print(t)

filtered_sentence = []
for w in t:
	if w in keywords:
		filtered_sentence.append(w)

# print("Extracted keywords-->"+str(filtered_sentence))
# print('*'*30)
for w in filtered_sentence:
	if w in kw[0]:
		freq[0] += 1
	if w in kw[1]:
		freq[1] += 1
Ejemplo n.º 59
0
#import nltk

#path = "/media/mynewdrive/new_txt"
#dirList = os.listdir(path)

path = "/media/mynewdrive/pos_story.txt"

f = open(path, "r")

a = WordNetLemmatizer()

for line in f:
    nline = line.rstrip()
    sep = nline.partition(" ")
    if sep[2] == "VERB":
        print a.lemmatize(sep[0], 'v') + " " + sep[2]
    elif sep[2] == "NOUN":
        print a.lemmatize(sep[0]) + " " + sep[2]
    elif sep[2] == "ADJ":
        print a.lemmatize(sep[0], 'a') + " " + sep[2]
    elif sep[2] == "ADV":
        print a.lemmatize(sep[0], 'r') + " " + sep[2]

#while 1:
#try:

#except line == KeyboardInterrupt:
#	break

#if not line:
#	break
Ejemplo n.º 60
0
"""

rdfGraph = Graph()

for triple in triples:
    sentence = str(triple).split(",")[1].split("=")[1][1:-1] + " " + str(
        str(triple).split(",")[2].split("=")[1])[1:-1] + " " + str(
            str(triple).split(",")[3].split("=")[1])[1:-2]

    s = str(triple).split(",")[1].split("=")[1][1:-1].lower()
    p = str(str(triple).split(",")[2].split("=")[1])[1:-1].lower()
    o = str(str(triple).split(",")[3].split("=")[1])[1:-2].lower()
    """
    Post-processing the triples
    """
    modified_p = lemmatizer.lemmatize(p, 'v') + " " + str(o.split(" ")[0])
    if (modified_p in white_dict.keys() or modified_p in type_dict.keys()):
        p = modified_p
        o = " ".join(o.split(" ")[1:])
    """
    subject checking 
    """
    nps = []
    doc = nlp(s)
    for np in doc.noun_chunks:
        nps.append(np.text)
    '''
    Substituting the DBPedia Spotlight Resource URLS
    '''
    subjects = []
    if (len(nps) > 0):