Example #1
import string
import operator
from nltk.stem.wordnet import WordNetLemmatizer

def uniqueWord(train_set, test_set):
	exclude = set(string.punctuation)
	lmtzr = WordNetLemmatizer()
	# presumably module-level globals in the original project; initialized here so the function is self-contained
	unique_word1 = []
	non_unique_word2 = []
	dictionary = {}
	for element1 in train_set:
		temp_list1 = element1.decode('utf-8').split()
		for word1 in temp_list1:
			word1 = lmtzr.lemmatize(word1)
			word1 = ''.join(ch for ch in word1 if ch not in exclude)
			if not word1 in unique_word1:
				unique_word1.append(word1)

	for element2 in test_set:
		temp_list2 = element2.decode('utf-8').split()
		for word2 in temp_list2:
			non_unique_word2.append(word2)

	dictionary.setdefault("list", []).append("list_item")
	for element in unique_word1:
		dictionary.update({element:1})

	for e in non_unique_word2:
		if e in dictionary:
			dictionary[e] += 1

	sorted_dic = sorted(dictionary.items(), key = operator.itemgetter(1))

	return sorted_dic
Example #2
 def _VERBAL_PREDICATE_FEATURE_Lemma(self):
     from nltk.stem.wordnet import WordNetLemmatizer
     lmtzr = WordNetLemmatizer()
     if self.pos in pos_penn_to_wordnet:
         return lmtzr.lemmatize(self.word, pos_penn_to_wordnet[self.pos])
     else:
         return False
def get_2prev_pos_lemma_verb(arg_clauses, clause_index, parse_dict):
    DocID = arg_clauses.DocID
    sent_index = arg_clauses.sent_index

    verb_pos = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
    curr_clause_indices = arg_clauses.clauses[clause_index][0]# ([1,2,3],yes)

    lmtzr = WordNetLemmatizer()

    first_verb = ""
    first_verb_index = 0
    for index in curr_clause_indices:
        word = parse_dict[DocID]["sentences"][sent_index]["words"][index][0]
        pos = parse_dict[DocID]["sentences"][sent_index]["words"][index][1]["PartOfSpeech"]
        if pos in verb_pos:
            word = lmtzr.lemmatize(word)
            first_verb = (word, index)
            break
        first_verb_index += 1
    if first_verb == "":
        return "NULL|NULL|NULL"
    if first_verb_index == 0:
        return "%s|%s|%s" % ("NULL", "NULL", first_verb[0])
    if first_verb_index == 1:
        prev1_pos = parse_dict[DocID]["sentences"][sent_index]["words"][first_verb[1] - 1][1]["PartOfSpeech"]
        return "%s|%s|%s" % ("NULL", prev1_pos, first_verb[0])

    prev1_pos = parse_dict[DocID]["sentences"][sent_index]["words"][first_verb[1] - 1][1]["PartOfSpeech"]
    prev2_pos = parse_dict[DocID]["sentences"][sent_index]["words"][first_verb[1] - 2][1]["PartOfSpeech"]
    return "%s|%s|%s" % (prev2_pos, prev1_pos, first_verb[0])
Example #4
def parseLine(line, stopWords_, wordInd, currWrd):
    """ Removes stop words and lemmas using nltk and punctuations 
    using re. Returns a list with valid words in the line. currWrd is
    the index of next word occurring for the first time
    """
    lineWords = []
    # Hyphens in hyphenated words are removed, e.g. wi-fi ==> wifi.
    line = re.sub(r'(\w)-(\w)', r'\1\2', line)
    # Replace underscore with space.
    line = re.sub(r'(\w)_(\w)', r'\1 \2', line)
    # Remove punctuation marks.
    line = re.sub(r"[',~`@#$%^&*|<>{}[\]\\\/.:;?!\(\)_+\"-]", r'', line)
    wnLmtzr = WordNetLemmatizer()    
    for word in line.split():
        # Get index of word from wordInd. If it is seen for the first 
        # time assign an index to the word.
        word = word.lower()    # case of words is ignored
        # Lemmatize word using word net function
        word = wnLmtzr.lemmatize(word, 'n')    # with noun
        word1 = wnLmtzr.lemmatize(word, 'v')    # with verb
        if len(word1) < len(word):    # select smaller of two
            word = word1                
        # Ignore stop words and numbers.
        if word in stopWords_ or \
                re.match('^\d+x?\d*$',word) is not None:
            continue
        # Update wordInd with number of occurrences of word.
        if word not in wordInd:                
            wordInd[word] = currWrd[0]
            currWrd[0] += 1
        # Update lineWords with word.
        lineWords.append(word)
    return lineWords
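
An illustrative call (my own toy inputs; assumes parseLine and its re/WordNetLemmatizer imports are in scope). wordInd accumulates a word-to-index map across calls and currWrd is a one-element counter:

stopWords_ = {'the', 'a', 'is', 'be'}   # note: the stop check runs on the lemmatized form, hence 'be'
wordInd, currWrd = {}, [0]
print(parseLine("The wi-fi router is working fine", stopWords_, wordInd, currWrd))
# e.g. ['wifi', 'router', 'work', 'fine'], with wordInd mapping each word to 0..3
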
Example #5
def parseLyrics2(outlist):
	bandLyricInfo = {} 
	master = [['death', 0],['violence',0],['sacrifice',0],['nature',0],['peace',0],['storm',0],['spirit',0],[ 'dark',0],['scream',0],['pain',0],['blood',0],['flesh',0],['love',0],['greed',0],['poison',0],['anger',0],['revenge',0],['misery',0],['hell',0],['heaven',0],['hate',0],['soul',0],['battle',0],['ghost',0],['joy',0],['light',0],['omen',0],['miracle',0],['magic',0],['universe',0],['disease',0],['god',0],['satan',0],['struggle',0],['heart',0]]
	for key in outlist:
		templist = copy.deepcopy(master)
		#key = 'Queensryche'
		raw = outlist[key]
		raw = raw.lower()
		words = re.findall(r'\w+', raw, flags=re.UNICODE)  # word tokens only; punctuation dropped
		imp_words = filter(lambda x: x not in stopwords.words('english'), words)  # filter stopword noise
		lmt = WordNetLemmatizer()
		words_new = [lmt.lemmatize(x) for x in imp_words]
		dw = list(set(words_new))
		
		for word in dw:
			for m in templist:
				p1 = wordnet.synsets(word) ;
				p2 = wordnet.synsets(m[0]) ;
				if(len(p1) >0 and len(p2) >0):
					c = p1[0].wup_similarity(p2[0])
					if(c > m[1]):
						m[1] = c
		# sort words according to similarity
		tnew = sorted(templist,key=lambda val:val[1],reverse=True) [0:10] ;
		# remove the other column
		for l in tnew:
			del l[1]
		print 'Done ',key
		#break ;
		bandLyricInfo[key] = tnew
		#del templist
	return bandLyricInfo
def feature_extractor(data):
    """Extract features from a relation for the classifier."""
    features = dict()
    lmtzr = WordNetLemmatizer()

    h2, h3, paragraph = data
    
    features['h2_' + h2.lower()] = True
    for word in h2.split(' '):
        if word.lower() not in stopwords.words('english') and len(word) > 1:
            features['h2word_' + word.lower()] = True
    features['h_' + h2.lower()] = True
    for word in h2.split(' '):
        if word.lower() not in stopwords.words('english') and len(word) > 1:
            features['hword_' + word.lower()] = True

    if h3 != None:    
        features['h3_' + h3.lower()] = True
        for word in h3.split(' '):
            if word.lower() not in stopwords.words('english') and len(word) > 1:
                features['h3word_' + word.lower()] = True
        features['h_' + h3.lower()] = True
        for word in h3.split(' '):
            if word.lower() not in stopwords.words('english') and len(word) > 1:
                features['hword_' + word.lower()] = True
        
    for word in nltk.wordpunct_tokenize(paragraph):
        if word.lower() not in stopwords.words('english') and len(word) > 1:
            features[word] = True
            features['lower_' + word.lower()] = True
            features['lmtzr_' + lmtzr.lemmatize(word).lower()] = True
    return features
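
A hedged usage sketch with invented data (assumes nltk, stopwords and the lemmatizer imports above, plus the required NLTK corpora):

data = ("World News", None, "Cats are chasing the mice in the garden.")
features = feature_extractor(data)
print(sorted(features.keys())[:5])
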
def cleanUp(rawWords):
  
  stops = [t.lower() for t in stopwords.words('english')]

  sumarr = []
  for i in range(0,len(rawWords)):
    arr = [t.lower() for t in rawWords[i].split()]
    for word in arr: 
      if word not in stops:
        sumarr.append(word.lower())
  
  punct1 = '.,?/><";![]:@#$%&*()'
  punct2 = "'"
  for i in range(0, len(sumarr)):
    if r'\\xe2' in sumarr[i]: sumarr[i] = '*'
    if len(sumarr[i]) > 1:
      if sumarr[i][-1] in punct1 or sumarr[i][-1] in punct2: sumarr[i] = sumarr[i][:-1]   # delete punctuation at the end of the word.
      if sumarr[i][-1] in punct1 or sumarr[i][-1] in punct2: sumarr[i] = sumarr[i][:-1]   # once more to delete double punctuations.
      if sumarr[i][0] in punct1 or sumarr[i][0] in punct2: sumarr[i] = sumarr[i][1:]   # delete punctuation at the start of the word.
    if len(sumarr[i]) > 2:
#      print sumarr
      if sumarr[i][-2] == "'" and sumarr[i][-1] == 's' : sumarr[i] = sumarr[i][:-2] # so that Jim's --> Jim.
      if sumarr[i][-2] == "'" and sumarr[i][-1] == 'm' : sumarr[i] = sumarr[i][:-2] # so that I'm --> I.
    if len(sumarr[i]) > 3:    
      if sumarr[i][-3] == 'n' and sumarr[i][-2] == "'" and sumarr[i][-1] == 't' : sumarr[i] = sumarr[i][:-3] # so that isn't --> is. Not is a stop word.
    
  lmtzr = WordNetLemmatizer()
  return [lmtzr.lemmatize(t) for t in sumarr if ("'" not in t and t not in stops)]
def wordMatch(story, ques):
    quesWords = word_tokenize(ques.ques)
    lemmatizer = WordNetLemmatizer()
    quesWords_Lemmatized = [lemmatizer.lemmatize(word) for word in quesWords]
    maxSentence = ''
    maxScore = 0
    maxSentenceWords = []
    for sent in story.sentences:
        sent = sent.replace(".", "")
        sent = sent.replace(",", "")
        sent = sent.replace("\n", " ")
        sent = sent.replace("\s", "")
        sent = sent.replace("\\", "")
        score = 0
        filteredWords_Lemmatized = story.sentLemmaWords[sent]
        postags = story.sentPosTags[sent]

        dict = {}
        propernouns = []
        referencetohuman = 'false'
        for tag in postags:
            dict[tag[0]] = tag[1]
            if 'NNP' in tag[1]:
                propernouns.append(tag[0])
            if 'NN' in tag[1]:
                referencetohuman = 'true'

        # Rule 1
        for qWord in quesWords_Lemmatized:
            if qWord in filteredWords_Lemmatized:
                if 'VB' in dict[qWord]:
                    score += 6
                    # break
                else:
                    score += 3
                    # break

        # Rule 2
        for pn in propernouns:
            if pn in quesWords_Lemmatized:  # The same noun word is present in the ques as well
                score += 6

        # Rule 3
        quesposttags = nltk.pos_tag(quesWords_Lemmatized)
        for tag in quesposttags:
            if 'NNP' in tag[1] and 'name' in filteredWords_Lemmatized:
                score += 4

        # Rule 4
        if propernouns.__len__() > 0 or referencetohuman == 'true':
            score += 4

        if score >= maxScore:
            maxScore = score
            maxSentence = sent
            maxSentenceWords = story.sentWords[sent]

    print("Answer: " + removeCommonWords(maxSentenceWords, quesWords))
    finalString = "\nAnswer: " + removeCommonWords(maxSentenceWords, quesWords) + "\n\n"
    answerFile.write(finalString)
	def convertToVec(self, line):
		lmtzr = WordNetLemmatizer()
		if isinstance(line, unicode):
			line = str(unicodedata.normalize('NFKD', line).encode('ascii','ignore'))
		#Strip of special characters
		line = re.sub(r'[^a-z^A-Z^0-9^,^.]|\^', ' ', line)
		line = line.lower()
		wordcount = {}
		count = self.Dic.count

		for word in line.split(' '):
			word = lmtzr.lemmatize(word)
			if isinstance(word, unicode):
				word = str(unicodedata.normalize('NFKD', word).encode('ascii','ignore'))
			if word in self.Dic.words.keys():
				num = self.Dic.words[word]
			else:				
				num = count
				count += 1
			if num not in wordcount.keys():
				wordcount[num] = 1
			else:
				wordcount[num] = wordcount[num] + 1
		
		vec = []
		for key in wordcount.keys():
			tp = (key, wordcount[key] + 0.0)
			vec.append(tp)
		return vec
Example #10
def lemmatize(tokens):
    tokenLemmas = []
    lmtzr = WordNetLemmatizer()
    
    for items in tokens:
        tokenLemmas.append([lmtzr.lemmatize(item) for item in items])
    return tokenLemmas
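
An illustrative call (my own toy input); tokens is expected to be a list of token lists, e.g. one per sentence:

print(lemmatize([["The", "cats", "were", "sitting"], ["two", "dogs"]]))
# e.g. [['The', 'cat', 'were', 'sitting'], ['two', 'dog']]
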
Example #11
def MakeLemmaList(tagged):
    # n noun
    # v verb
    # a adje
    # r adverb
    # m,w,.. something else

    noun_op, adj_op, adv_op, verb_op, other_op = [], [], [], [], []

    lm = WordNetLemmatizer()
    for i in tagged:
        # print i, i[0], i[1][0:2]
        if i[1][:1] == "N":
            noun_op.append(lm.lemmatize(i[0], "n"))
        elif i[1][:1] == "V":
            asd = lm.lemmatize(i[0], "v")
            if asd not in ("be", "have", "do", "done", "should"):
                verb_op.append(asd)
        elif i[1][:1] == "J":
            adj_op.append(lm.lemmatize(i[0], "a"))
        elif i[1][:1] == "R":
            adv_op.append(lm.lemmatize(i[0], "r"))
        else:
            # print lm.lemmatize(i[0])+ " "
            pass
    final_op = noun_op + verb_op + other_op + adj_op + adv_op
    return final_op
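
A hedged usage sketch (my own sentence; assumes nltk plus its tagger and tokenizer data are installed):

import nltk
tagged = nltk.pos_tag(nltk.word_tokenize("The quick foxes were running happily"))
print(MakeLemmaList(tagged))
# e.g. ['fox', 'run', 'quick', 'happily'] -- "were" lemmatizes to "be" and is filtered out
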
Example #12
def get_skill_for_entity(entity_name):

    lmtzr = WordNetLemmatizer()
    name = entity_name.replace('_', ' ')
    lemma = lmtzr.lemmatize(name)
    skills = Skill.objects.filter(lemma_name=lemma)
    return skills
Example #13
def get_cooc(chunk_trees,stoplist=True):
  triples, simple_trees = [], []
  lmtzr = WordNetLemmatizer()
  for t in chunk_trees:
    entities = []
    for chunk in t[:]:
      if isinstance(chunk,Tree) and chunk.node == 'NP':
        # getting a tree for later processing of triples from the simple noun 
        # phrases (if present)
        simple_trees.append(parser_smp.parse(chunk.leaves()))
        words = []
        for word, tag in chunk[:]:
          # stem/discard elements and construct an argument
          if (stoplist and word in STOPLIST) or \
          (len([x for x in word if x.isalnum()]) == 0):
            # do not process stopwords for simple trees, do not process purely 
            # non alphanumeric characters
            continue
          if tag.startswith('N'):
            words.append(lmtzr.lemmatize(word,'n'))
          elif tag.startswith('J'):
            words.append(lmtzr.lemmatize(word,'a'))
          else:
            words.append(word)
        if len(words) > 0:
          entities.append(SEP.join(words))
    for e1, e2 in combinations(entities,2):
      triples.append((e1,util.COOC_RELNAME,e2))
      triples.append((e2,util.COOC_RELNAME,e1))
  return triples, simple_trees
def getting_sentiment(word,pos):
    flag = 0
    if 'NN' in pos:
        tag = 'n'
    elif 'JJ' in pos:
        tag = 'a'
        if pos == 'JJS':
            flag = 1
    elif 'VB' in pos:
        tag = 'v'
    elif 'RB' in pos:
        tag = 'r'
    else:
        tag = ''
    stemmer = WordNetLemmatizer()
    if tag != '':
        x = stemmer.lemmatize(word,tag)
    else:
        x = stemmer.lemmatize(word)

    try:
        score = float(score_dic[x]) #* float(m1)
    except KeyError:
        if len(swn.senti_synsets(x,tag)) > 0:
            score = swn.senti_synsets(x,tag)[0].pos_score() * 5
        else:
            score = 100

    if flag == 1 and score != -100 and score < 4:
        score = score + 1
    elif flag == 1 and score != -100 and score > -4 and score < 0:
        score = score - 1
    print word + '--->' + str(score)
    return score
Example #15
def processwords(words):
        # Lemmatize the words
        print 'Lemmatizing...'
        lmtzr = WNL()
        lemmatized = [lmtzr.lemmatize(w) for w in words ]
        print len(lemmatized)
        # Create a dictionary of the words and the counts
        # Place words in a Counter collection object (this removes duplicates and counts the occurences of a word)
        print 'Mapping words to counts...'
        word_dict = Counter(lemmatized)
        print len(word_dict)
        # Drop out words that occur less than 100 times in the entire set of webpages
        print 'Removing words that appear less than 100 times...'
        for key, count in dropwhile(lambda key_count: key_count[1] >= 100, word_dict.most_common()):
                del word_dict[key]
        print len(word_dict)
        # Filter the words of stopwords (too common), non-English words, and single-letter words
        print 'Filtering out stopwords, non-English words, and single-lettered words...'
        for w in list(word_dict):
                if w in stopwords.words('english'):
                        del word_dict[w]
                elif not wordnet.synsets(w):
                        del word_dict[w]
                elif len(w)==1:
                        del word_dict[w]
        print len(word_dict)
        return word_dict
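
The dropwhile call above works because Counter.most_common() is sorted by descending count, so everything from the first sub-threshold item onward can be deleted. A standalone illustration with toy counts and a threshold of 2 instead of 100:

from collections import Counter
from itertools import dropwhile

word_dict = Counter({"cat": 5, "dog": 2, "axolotl": 1, "yeti": 1})
for key, count in dropwhile(lambda kc: kc[1] >= 2, word_dict.most_common()):
    del word_dict[key]
print(word_dict)  # Counter({'cat': 5, 'dog': 2})
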
Example #16
def get_bag_of_senses(temp_words1):
    senses = []
    lmtzr = WordNetLemmatizer()
    temp_words1 = nltk.pos_tag(temp_words1.split())
    
    for t in temp_words1:
        try:
            if 'VB' in t[1]:
                senses.append(wordnet.synsets(lmtzr.lemmatize(t[0],'v')))
            else:
                senses.append(wordnet.synsets(t[0]))
        except:
            pass
        
    hypernyms = []
    for sense_l in senses:
        for s in sense_l:
            hypernyms.append(s.hypernyms())
    
    hyponyms = []
    for sense_l in senses:
        for s in sense_l:
            hyponyms.append(s.hyponyms())
            
    '''meronyms = []
    for sense_l in senses:
        for s in sense_l:
            meronyms.append(s.part_meronyms())        
    
    toponyms = []
    for sense_l in senses:
        for s in sense_l:
            toponyms.append(s.part_holonyms())'''
    
    definitions = []
    for sense_l in senses:
        if len(sense_l) > 1:
            for s in sense_l:
                definitions.append(s.definition)
    
    for sense_l in hypernyms:
        if len(sense_l) > 1:
            for s in sense_l:
                definitions.append(s.definition)
   
    for sense_l in hyponyms:
        if len(sense_l) > 1:
            for s in sense_l:
                definitions.append(s.definition)
    
    '''for sense_l in meronyms:
        for s in sense_l:
            definitions.append(s.name)
            
    for sense_l in toponyms:
        for s in sense_l:
            definitions.append(s.name)'''
                    
    definitions = ' '.join(definitions)
    return definitions
    def run(self):
        """
        How do I run this Task?
        Luigi will call this method if the Task needs to be run.
        """
        # remove stop words and punctuation
        stop = set(stopwords.words('english'))
        tokenizer = RegexpTokenizer(r'\w+')
        wordnet = WordNetLemmatizer()

        docs = []

        #ipdb.set_trace()

        for f in self.input(): # The input() method is a wrapper around requires() that returns Target objects
            lines = 0
            words = []

            for line in f.open('r'):
                if lines == 0:
                    label = line
                    lines +=1
                else:
                    words.extend(tokenizer.tokenize(line))
                    lines +=1

            words_filtered = [wordnet.lemmatize(w) for w in words if w not in stop]
            docs.append((label, '\t'.join(words_filtered)))

        out = self.output().open('w')
        for label, tokens in docs:
            out.write("%s,%s\n" % (label.strip(), tokens.strip()))
        out.close()
Example #18
def lemmat(str):

  lemm = WordNetLemmatizer()
  split = str.split(' ')
  index = 1
  new = []
  new1  = []
  dict = {}
  new1.append(['',0,0])
  pom2 = 0
  for word in split:
    item = []
    new.append(word)
    if word != '':
      pom = lemm.lemmatize(word,'n')
      item.append(pom)
      item.append(pom2 + new1[-1][1])
      dict[index]= item
      item.append(index)
      new1.append(item)
      index += len(pom) + 1
      pom2 = len(word) - len(pom)
    else:
      pom2 += 1
  outp = ''
  for i in new1:
    outp += i[0] + ' '
  outp = outp[1:]
  result = []
  result.append(outp)
  result.append(dict)
  return result
Example #19
File: cleaner.py Project: j2kun/svd
def process():
    print("Loading...")
    documentDict = loadRaw('data/cnn-stories')
    documents = []

    print("Cleaning...")
    i = 0
    for filename, documentText in documentDict.items():
        tokens = tokenize(documentText)
        tagged_tokens = pos_tag(tokens)
        wnl = WordNetLemmatizer()
        stemmedTokens = [wnl.lemmatize(word, wordnetPos(tag)).lower()
                         for word, tag in tagged_tokens]

        documents.append({
            'filename': filename,
            'text': documentText,
            'words': stemmedTokens,
        })
        if i % 100 == 0:
            print(i)
        i += 1

    print("Writing to disk...")
    with open('all_stories.json', 'w') as outfile:
        outfile.write(json.dumps(documents))

    print("Done!")
Example #20
from nltk.stem.wordnet import WordNetLemmatizer

wnl = WordNetLemmatizer()

def lemmatize(w, p):
    if p.startswith("N"):
        return (wnl.lemmatize(w, 'n'), p)
    elif p.startswith("V"):
        return (wnl.lemmatize(w, 'v'), p)
    else:
        return (w, p)
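
Illustrative calls (my own) showing the (lemma, tag) pairs returned:

print(lemmatize("dogs", "NNS"))     # ('dog', 'NNS')
print(lemmatize("running", "VBG"))  # ('run', 'VBG')
print(lemmatize("quickly", "RB"))   # ('quickly', 'RB') -- non-N/V tags are passed through
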
Example #21
File: svm.py Project: Chunpai/cs200
def convert_speeches_into_matrix(features,speech_list,label):    
    sample_matrix = []
    label_vector  = []
    #print len(features)
    for speech in speech_list:
        sample = []
        speech = re.sub('http://[a-zA-Z0-9|/|.]*',' ',speech)
        speech = re.sub('%[0-9|.]*', ' ', speech)
        speech = re.sub('$[0-9|.]*',' ', speech)
        for ch in " \"$!'@#%&()*+,-./:;<=>?[\\]^_`{|}~ ":
            speech = speech.replace(ch,' ')

        tokens = speech.split()
        
        #word lemmatization
        lmtzr = WordNetLemmatizer()
        tokens = [lmtzr.lemmatize(token) for token in tokens]
        tokens = [lmtzr.lemmatize(token,'v') for token in tokens]

        #tokens = bigrams(tokens)                    # uncomment this line, we can use bigram as
        unique_tokens_dict = collections.Counter(tokens)

        for fea in features:
            if fea in unique_tokens_dict:
                sample.append(unique_tokens_dict[fea])
            else:
                sample.append(0)
       
        #print(sample)
        sample_matrix.append(sample)
        label_vector.append(label)
    
    return sample_matrix,label_vector
Example #22
def lemma_tokenize(paragraph):
    lmtzr = WordNetLemmatizer()
    try:
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
    except LookupError:
        nltk.download('wordnet')
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
Example #23
 def wordLemmatization(self):
     #should be working now
     lemmatizer = WordNetLemmatizer()
     lemmatization_result = []
     for word in self.file:
         lemmatization_result.append(lemmatizer.lemmatize(word))
     self.file=lemmatization_result
Example #24
def main():

    rake=RAKE.Rake('SmartStoplist.txt')
    fp=open(input_file,'r')
    text=fp.read()
    text=text_clean(text)
    wnl=WordNetLemmatizer()
    text=' '.join([wnl.lemmatize(i.strip()) for i in nltk.word_tokenize(text)])
    keywords=rake.run(text)
    #print keywords
    #key_list=list()
    with open(key_score_file,'wb') as out:
        csv_out=csv.writer(out)
        csv_out.writerow(['KEYWORD','SCORE'])
        for row in keywords:
            #csv_out.writerow(row)
            if row[1]>0:
                csv_out.writerow(row)
    unibitrigram_list=[]
    unibitrigram_list=generate_unibitrigrams(key_score_file)


    ngram_freq=Counter(unibitrigram_list)
    sorted_ngram_freq=sorted(ngram_freq.items(),key=lambda x:x[1],reverse=True )
    print ngram_freq
    with open('bcom_ngramfr.csv','wb') as nf_csv:
        csv_wr=csv.writer(nf_csv)
        for item in sorted_ngram_freq:
            if ((item[0]!='' or item[1]>0 )):
                csv_wr.writerow(item)
def clean_single_word(word, lemmatizing="wordnet"):
    """
    Performs stemming or lemmatizing on a single word.

    If we are to search for a word in a clean bag-of-words, we need to search it after the same kind of preprocessing.

    Inputs: - word: A string containing the source word.
            - lemmatizing: A string containing one of the following: "porter", "snowball" or "wordnet".

    Output: - lemma: The resulting clean lemma or stem.
    """
    if lemmatizing == "porter":
        porter = PorterStemmer()
        lemma = porter.stem(word)
    elif lemmatizing == "snowball":
        snowball = SnowballStemmer('english')
        lemma = snowball.stem(word)
    elif lemmatizing == "wordnet":
        wordnet = WordNetLemmatizer()
        lemma = wordnet.lemmatize(word)
    else:
        print("Invalid lemmatizer argument.")
        raise RuntimeError

    return lemma
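
A hedged usage sketch comparing the three options named in the docstring (my own example word; assumes the function and the stemmer imports it relies on are in scope):

for method in ("porter", "snowball", "wordnet"):
    print(method, clean_single_word("studies", lemmatizing=method))
# e.g. porter studi / snowball studi / wordnet study
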
def stemWordMatch(question,sentence):

    lmtzr = WordNetLemmatizer()

    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens=set(nltk.word_tokenize(sentence))

    count=0
    '''for i in sentence_tokens:
        #Finding the exact word match
        if lmtzr.lemmatize(i, 'v').lower() in  [lmtzr.lemmatize(x, 'v').lower() for x in question_tokens]:
            #print  'matching word is:',i
            count=count+6
        elif i.lower() in [x.lower() for x in question_tokens]:
            print 'i is :',i
            count=count+3
    #print 'Exact word match count is :',count'''

    for i in sentence_tokens:
        #Finding the exact word match

        if i.lower() in [x.lower() for x in question_tokens]:
            #print 'i is :',i
            count=count+3
        elif lmtzr.lemmatize(i, 'v').lower() in  [lmtzr.lemmatize(x, 'v').lower() for x in question_tokens]:
            #print  'matching word is:',i
            count=count+6

    #print 'Exact word match count is :',count


    return count
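
An illustrative call with an invented question/sentence pair; a higher score means more (lemma-level) overlap:

print(stemWordMatch("What did the dog eat", "The dog ate the bone"))
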
Example #27
def data_preprocessing(file_path):
    f = open(file_path,'r')
    speech_list = f.read().split("###")   # read speeches, split with ###, and save them into list.
    del speech_list[-1]
    f.close()
    #print len(speech_list)
    f = open(file_path,'r')
    speeches = f.read().lower()    #set all letters lower case
    speeches = re.sub('http://[a-zA-Z0-9|/|.]*',' ',speeches)
    speeches = re.sub('%[0-9|.]*', ' ', speeches)
    speeches = re.sub('$[0-9|.]*',' ', speeches)
    #speeches = re.sub('\\\\xe2\\\\x80\\\\x[a-zA-Z0-9]*',' ',speeches)
    #print speeches
    for ch in " \"$!'@#%&()*+,-./:;<=>?[\\]^_`{|}~ ":
        speeches = speeches.replace(ch,' ')

    tokens = speeches.split()
    
    #word lemmatization
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(token) for token in tokens]
    tokens = [lmtzr.lemmatize(token,'v') for token in tokens]

    #tokens = bigrams(tokens)                    # uncomment this line, we can use bigram as

    total_tokens_count = len(tokens)
    unique_tokens_dict = collections.Counter(tokens)   #key is word, value is the count,
                                                       #also default value 0 for non-exsit key.

    result = [ speech_list, unique_tokens_dict, total_tokens_count ]
    return result
Example #28
    def __tokenize(self,text):
        """ function: tokenize
            ------------------
            generate list of tokens given a block of @text

            :param text: string representing article text field
            :returns: list of tokens with various modifications
        """
        ascii = text.encode('ascii', 'ignore')
        # remove digits & punctuation
        no_digits = ascii.translate(None, string.digits)
        no_punctuation = no_digits.translate(None, string.punctuation)
        # separate text blocks into tokens
        tokens = nltk.word_tokenize(no_punctuation)
        # remove class labels, stopwords, and non-english words
        no_class_labels = [w for w in tokens if not w in Document.banned_words]
        no_stop_words = [w for w in no_class_labels if not w in stopwords.words('english')]
        eng = [y for y in no_stop_words if wordnet.synsets(y)]
        # lemmatization
        lemmas = []
        lmtzr = WordNetLemmatizer()
        for token in eng:
           lemmas.append(lmtzr.lemmatize(token))
        # stemming
        stems = []
        stemmer = PorterStemmer()
        for token in lemmas:
            stem = stemmer.stem(token).encode('ascii', 'ignore')
            if len(stem) >= 4:
                stems.append(stem)
        return stems
Example #29
def weed_out_lexelts(tweets_file):
    lexelts = []
    WNL = WordNetLemmatizer()
    with open(tweets_file, 'r') as twh:
        for line in twh:    
            line = line.strip().split(' :: ')[1]
            lexelts_temp = []
            try:
                lexelts_temp = pos_tag(word_tokenize(line))
            except TypeError:
                print line

            # Get sanitized parts of speech, not the Treebank style
            # Tuples are immutable, need to make a new single-tuple list 
            for w, p in lexelts_temp:
                new_p = get_sanitized_pos(p)
                new_w = w
                try: 
                    new_w = WNL.lemmatize(w, new_p)
                except KeyError:
                    pass
                lexelts.extend([(new_w, new_p)])
    
    lexelts = list(set(lexelts))
    print lexelts
    return lexelts
Example #30
def getlemmas(tokens):
    lemmas = []
    for token in tokens:
        if len(token) < 2 or not isWord(token) or token == "the":
            lemmas.append({})
            continue
        
        tokenLemmas = {}
        #Synonyms
        for syn in wn.synsets(token):
            #Derived Forms and their Syns
            for lemma in syn.lemmas():
                for df in lemma.derivationally_related_forms():
                    for ln in df.synset().lemma_names():
                        tokenLemmas[ln] = 4
                    tokenLemmas[df.name()] = 3
            for lname in syn.lemma_names():
                tokenLemmas[lname] = 2
        
        #Wordnet lemmas
        l = WordNetLemmatizer()
        for x in ('v','a','s','r','n'):
            tmp = l.lemmatize(token, x)
            tokenLemmas[tmp] = 1
            tmp = l.lemmatize(tmp, x)
            tokenLemmas[tmp] = 1
        
        #Exact
        tokenLemmas[token] = 1
        
        lemmas.append(tokenLemmas)
    
    return lemmas
from gensim.models import Word2Vec

from pprint import pprint

google = gensim.models.KeyedVectors.load_word2vec_format(
    '~/word2vec-model/GoogleNews-vectors-negative300.bin', binary=True)

with open('data/requirements.txt', 'r') as myfile:
    data = myfile.read().replace('\n', ' ')

#stemmer = PorterStemmer()
#stemmed_text1 = [stemmer.stem(i) for i in word_tokenize(data1)]
#s1 = ' '.join(stemmed_text1)
#print 'Stemmed text1: %s \n\n\n' % s1

lemma = WordNetLemmatizer()
#lemma_text = [lemma.lemmatize(i, pos="n") for i in word_tokenize(data1)]

# Remove stopwords
stops = set(stopwords.words("english"))
#lemma_filtered = [word for word in lemma_text if word not in stops]

#ls1 = ' '. join(lemma_text)

#print 'Lemma text1: %s \n\n\n' % ls1

#with open("data/ls1.txt", 'w') as f:
#  f.write(ls1)

#print 'Text1 %s' % string.join(stemmed_text1, " ")
Example #32
from gensim import corpora, models, similarities
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
import re
from collections import defaultdict


lemmatizer = WordNetLemmatizer()
snb = SnowballStemmer('english')


def remove_punctuation(text):
    # the stdlib re module has no \p{P} support; strip punctuation/symbol characters explicitly instead
    return re.sub(r"[^\w\s]", "", text)


def stem_with_replacement(texts):
    '''
    attempts to find a common form of each word. keeps the shortest full word for each stem rather than nonsensible root. common form is taken from the entire corpus rather than just the single document
    '''
    stem_dict = defaultdict(set)
    unstem = lambda x: min(stem_dict[x], key=len)

    words = word_tokenize(' '.join(texts).lower())
    for word in words:
        stemmed = snb.stem(word)
        stem_dict[stemmed].add(word)

    new_texts = []
Example #33
         'auguments', 'get', 'string', 'prototype', 'nodeType', 'slice', 'header', 'top', 'li',
         'style', 'Appendix','Table', 'owl', 'hover', 'pageination']
for i in extra:
    stop_words.append(i)

keywords = [word for word in tokens if not word in stop_words and not word in punctuations]

#nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer

##Convert to list from string
text = text.split()
    
    
#Lemmatisation
lem = WordNetLemmatizer()
text = [lem.lemmatize(word) for word in text if not word in stop_words] 
text = " ".join(text)

corpus = []
corpus.append(text)

#Word cloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

wordcloud = WordCloud(
                          background_color='white',
                          stopwords=stop_words,
def success(text):
    data = {}
    data[0] = text
    pd.set_option('max_colwidth', 200)
    df1 = pd.DataFrame.from_dict(data, orient='index')
    df1.columns = ['Lyrics']

    def round1(text):
        # lower the Text
        text = text.lower()
        # Remove Numbers
        text = re.sub(r"\d+", "", text)
        # Remove Symbols and special characters
        # Below return true if not alphanumereic
        text = re.sub(r'[^\w]', ' ', text)
        # Remove more than a single whitespace
        text = ' '.join(text.split())
        # Remove Leading and Trailing Whitespaces
        text = text.strip()
        return text

    rnd1 = lambda x: round1(x)
    df2 = df1.copy()
    df2['Lyrics'] = df2['Lyrics'].apply(rnd1)

    stop = list(string.punctuation)

    def cleaning(text):
        clean_doc = []
        for x in text:
            clean_sent = []
            for i in word_tokenize(x):
                # for i in x.lower():
                if i not in stop:
                    clean_sent.append(i)
            clean_doc.append(clean_sent)
        return clean_doc

    df3 = df2.copy()
    df3['Lyrics'] = cleaning(df3['Lyrics'])

    s = ' '
    for i in range(len(df3)):
        df3['Lyrics'].loc[i] = s.join(df3['Lyrics'].loc[i])

    wordnet = WordNetLemmatizer()

    def Lemmatizing(text):
        pre_doc = []
        for word in text:
            pre_doc.append(wordnet.lemmatize(word))
        return pre_doc

    df4 = df3.copy()
    df4['Lyrics'] = Lemmatizing(df4['Lyrics'])

    cv = CountVectorizer(stop_words='english')
    df5 = cv.fit_transform(df4['Lyrics'])
    df6 = pd.DataFrame(df5.toarray(), columns=cv.get_feature_names())
    df6.index = df4.index

    df7 = df6.transpose()

    top_dict = {}
    for c in df7.columns:
        top = df7[c].sort_values(ascending=False).head(30)
        top_dict[c] = list(zip(top.index, top.values))

    for album, top_words in top_dict.items():
        print(album)
        print(', '.join([word for word, count in top_words]))
        print('------------')

    ts = Translator()
    res = ts.translate(df4['Lyrics'].loc[0], dest='hi')
    hitext = res.text
    return '%s' % hitext

def clean(content):
    temptext = re.sub('[^a-zA-Z]', ' ', str(content))
    temptext = temptext.lower()
    tokens = nltk.word_tokenize(temptext)
    #tokens = [word for word in tokens if word not in set(builtinstopwords)] 
    cleanbody= [lm.lemmatize(word) for word in tokens if not word in set(builtinstopwords)]
    return (str(cleanbody)[1:-1])

def exec_time(start, end):
    if (end - start) <= 60:
        print("Total Execution time was {} seconds".format(end - start))
    else:
        print("Total Execution time was {} minutes".format((end - start)/60))

#%% 
# TODO - check the difference in time when lemmatizer is instantiated inside the clean function vs outside
lm = WordNetLemmatizer()
df['cleaned']=df['text'].apply(lambda x : clean(x))
df.reset_index(drop=True,inplace=True)

#%%
# FIT THE TFIDF VECTORIZER AND PICKLE THE VOCAB

#tfidf_obj = TfidfVectorizer(max_df=0.5,min_df=0.01,use_idf=True)
tfidf_obj = TfidfVectorizer(max_features = 5000)
X_train_tfidf = tfidf_obj.fit_transform(df.cleaned)
# X_train_tfidf.shape

# feature_list = tfidf_obj.vocabulary_
# feature_list

#%%
Example #36
 def __init__(self):
     self.wordnet_lemmatizer = WordNetLemmatizer()
     self.mapping = tagset_mapping('en-ptb', 'universal')
#exit()
Roles_Entities = Extract_Roles_entities(processed_input, CC_resolve_pos_dict,
                                        conjunction_index)
print("\n\n")
print("processed_information: ", processed_information)
print("\n\n")

print("schema Identification.....")
possible_schemas = []
possible_schemas_sent_index = []
for schema, values in schemas_keys.items():
    for ind, sent in enumerate(processed_input):
        sent_tokens = nltk.word_tokenize(sent)
        for w in sent_tokens:
            word_lemmatization = WordNetLemmatizer().lemmatize(w, 'v')
            if (word_lemmatization in values):
                possible_schemas.append(schema)  # Schema and sent's index
                possible_schemas_sent_index.append(ind)

print("\n\n")
print("possible_schemas and their sent's indices: ", possible_schemas,
      possible_schemas_sent_index)
print("\n")

print("Identified Unique Schemas: ", set(possible_schemas))
Unique_Schemas = set(possible_schemas)
print("\n")

for pred in Unique_Schemas:
    #print("Test sample",i+1,pred[i])
Example #38
pat_s2 = re.compile("(?<=s)\'s?")
# to find the abbreviation of not
pat_not = re.compile("(?<=[a-zA-Z])n\'t")
# to find the abbreviation of would
pat_would = re.compile("(?<=[a-zA-Z])\'d")
# to find the abbreviation of will
pat_will = re.compile("(?<=[a-zA-Z])\'ll")
# to find the abbreviation of am
pat_am = re.compile("(?<=[Ii])\'m")
# to find the abbreviation of are
pat_are = re.compile("(?<=[a-zA-Z])\'re")
# to find the abbreviation of have
pat_ve = re.compile("(?<=[a-zA-Z])\'ve")


lmtzr = WordNetLemmatizer()
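
A minimal sketch of my own (the project's replace_abbreviations is not shown above) illustrating how these compiled patterns can expand contractions with sub():

def expand_contractions(text):
    # my own illustrative helper, not the project's function
    text = pat_not.sub(" not", text)
    text = pat_would.sub(" would", text)
    text = pat_will.sub(" will", text)
    text = pat_am.sub(" am", text)
    text = pat_are.sub(" are", text)
    text = pat_ve.sub(" have", text)
    text = pat_s2.sub("", text)
    return text

print(expand_contractions("I'm sure they'll say it isn't finished"))
# I am sure they will say it is not finished
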


def get_words(file):  
    with open (file) as f:  
        words_box=[]
        pat = re.compile(r'[^a-zA-Z \']+')
        for line in f:                           
            #if re.match(r'[a-zA-Z]*',line): 
            #    words_box.extend(line.strip().strip('\'\"\.,').lower().split())
            # words_box.extend(pat.sub(' ', line).strip().lower().split())
            words_box.extend(merge(replace_abbreviations(line).split()))
    return collections.Counter(words_box)  


def merge(words):
Example #39

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''


lemmatizer = WordNetLemmatizer()

filename = "../resources/semeval/train/english-lexical-sample.train.xml"
output_dir = "../resources/semeval/lexelts"

lemmas = []
tree = etree.parse(filename)

for lexelt_idx, lexelt in enumerate(tree.findall("lexelt")):
    lexelt_lemma = lexelt.attrib['item']
    lemmas.append(lexelt_lemma)

    print "Analysing {} (number {})".format(lexelt_lemma, lexelt_idx)

    sentences = []
Example #40
def get_answer(question, story):
    """
    :param question: dict
    :param story: dict
    :return: str


    question is a dictionary with keys:
        dep -- A list of dependency graphs for the question sentence.
        par -- A list of constituency parses for the question sentence.
        text -- The raw text of story.
        sid --  The story id.
        difficulty -- easy, medium, or hard
        type -- whether you need to use the 'sch' or 'story' versions
                of the .
        qid  --  The id of the question.


    story is a dictionary with keys:
        story_dep -- list of dependency graphs for each sentence of
                    the story version.
        sch_dep -- list of dependency graphs for each sentence of
                    the sch version.
        sch_par -- list of constituency parses for each sentence of
                    the sch version.
        story_par -- list of constituency parses for each sentence of
                    the story version.
        sch --  the raw text for the sch version.
        text -- the raw text for the story version.
        sid --  the story id


    """
    ###     Your Code Goes Here         ###
    # Our tools

    stemmer = SnowballStemmer("english")
    chunker = nltk.RegexpParser(GRAMMAR)
    lmtzr = WordNetLemmatizer()

    driver = QABase()

    # question["qid"] returns the form: "fables-04-7"
    q = driver.get_question(question["qid"])
    current_story = driver.get_story(q["sid"])
    text = story["text"]

    # Apply the standard NLP pipeline we've seen before
    sentences = get_sentences(text)

    # tokenize questions, also removing punctuations to extract keywords
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_question_text = tokenizer.tokenize(question["text"])
    tagged_tokenized_question_text = nltk.pos_tag(tokenized_question_text)

    # remove stopwords
    tagged_keywords_list = []
    stopwords = set(nltk.corpus.stopwords.words("english"))
    for word, tag in tagged_tokenized_question_text:
        if word not in stopwords:
            tagged_keywords_list.append((word, tag))

    # lemmatize keywords
    lemmatized_keywords_list = []
    for keyword, tag in tagged_keywords_list:
        lemmatized_keywords_list.append(stemmer.stem(keyword))

    # Find the sentences that have all of our keywords in them
    target_sentences = find_sentences(lemmatized_keywords_list, sentences)
    # Extract the candidate locations from these sentences
    candidates_forest = find_candidates(target_sentences, chunker,
                                        question["text"])

    if (question["difficulty"] == 'Easy' and len(candidates_forest) != 0):

        possible_answers_list = []

        # locations is a list of trees
        for candidate in candidates_forest:
            # candidate.draw()
            possible_answers_list.append(" ".join(
                [token[0] for token in candidate.leaves()]))
        answer = " ".join(possible_answers_list)

        ###########################################
        # currently, possible_answer contains the actual needed answer,
        # plus some garbage words around it from chunking,
        # we might be able to filter this out SOMEHOW
        # possible_answer is a list of strings
        ###########################################

    elif question["difficulty"] == 'Medium':

        if question["type"] != 'Story':
            sentences = get_sentences(current_story["sch"])
        else:
            sentences = get_sentences(current_story["text"])

        Q = nltk.word_tokenize(question["text"].lower())
        # print(Q)

        all_stemmed_sentences = []
        for sent in sentences:
            temp_sent = []
            for w, pos in sent:
                temp_sent.append((stemmer.stem(w), pos))
            all_stemmed_sentences.append(temp_sent)
        stop_words = set(nltk.corpus.stopwords.words("english"))
        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        stemmed_qbow = []
        for w in qbow:
            stemmed_qbow.append(stemmer.stem(w))
        stemmed_qbow = set(stemmed_qbow)
        best_idx = best_overlap_index(stemmed_qbow, all_stemmed_sentences,
                                      stop_words)
        # print(question["qid"], best_idx)

        if question["type"] != 'Story':
            tree = current_story["sch_par"][best_idx]
        else:
            tree = current_story["story_par"][best_idx]

        #############################################
        # if question["qid"] == 'blogs-03-13':
        #     print(Q)
        #     print(tree)
        #     print("++++++++++++++++++++++++++++++++++++++++++++++")
        ############################################
        # print(tree)
        # Create our pattern

        # First level subtree matching
        # candidate_sents = []
        #
        # for sub in tree:
        #     subsent = " ".join(sub.leaves())
        #     candidate_sents.append(subsent)
        #
        # stemmed_candidate_sents = []
        # for s in candidate_sents:
        #     temp_candidate_sents = []
        #     s = nltk.word_tokenize(s)
        #     s = nltk.pos_tag(s)
        #
        #     for w, p in s:
        #         temp_candidate_sents.append((stemmer.stem(w), p))
        #     stemmed_candidate_sents.append(temp_candidate_sents)
        #
        # best_idx = best_overlap_index(stemmed_qbow, stemmed_candidate_sents, stopwords)
        # tree = tree[best_idx]
        # if question["qid"] == 'mc500.train.18.18':
        #     print(tree)

        #########################################
        # MAKE PATTERN FIT FOR TYPE OF QUESTION #
        #########################################
        # print(Q[0])
        if Q[0] == 'where' or Q[0] == 'when':
            pattern = nltk.ParentedTree.fromstring("(VP (*) (PP))")
        elif Q[0] == 'who':
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif Q[0] == 'what':
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif Q[0] == 'why':
            pattern = nltk.ParentedTree.fromstring("(SBAR)")
        elif Q[0] == 'how':
            pattern = nltk.ParentedTree.fromstring("(RB)")

        # don't know how to deal with 'did' questions
        elif Q[0] == 'did':
            pattern = nltk.ParentedTree.fromstring("(S)")

        subtree1 = pattern_matcher(pattern, tree)

        ############################################
        # if question["qid"] == 'blogs-03-13':
        #     print("subtree1")
        #     print(subtree1)
        ############################################
        if subtree1 == None:
            #######################################
            answer = doBaseline(question, story)
            # answer = "doBaseline"
            #######################################
        else:
            # create a new pattern to match a smaller subset of subtrees
            if Q[0] == 'where' or Q[0] == 'when':
                pattern = nltk.ParentedTree.fromstring("(VP)")
            elif Q[0] == 'who':
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif Q[0] == 'what':
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif Q[0] == 'why':
                pattern = nltk.ParentedTree.fromstring("(SBAR (IN) (S))")
            elif Q[0] == 'how':
                pattern = nltk.ParentedTree.fromstring("(RB)")

            # don't know how to deal with 'did' questions
            elif Q[0] == 'did':
                pattern = nltk.ParentedTree.fromstring("(S)")

            # Find and make the answer
            # print(subtree)
            subtree2 = pattern_matcher(pattern, subtree1)
            answer = " ".join(subtree2.leaves())

            ############################################
            # if question["qid"] == 'mc500.train.18.18':
            #     print("subtree2")
            #     print(subtree2)
            ############################################
            # cheat for dealing with 'did' questions
            if Q[0] == 'did':
                answer = "yes"

    else:
        #########################################
        answer = doBaseline(question, story)
        # answer = "doBaseline"
        #########################################

    ###     End of Your Code         ###
    return answer
Example #41
#            if ele[6] == '331786748':
#                s = ele[1] + " " + ele[2]
#                print(s,file=tfile)

docs = []
with open(
        '/Users/shrey/AnacondaProjects/Application_reviews/Experiments/CNNnouns/RawData/CNNnews.txt',
        'r') as tfile:
    for line in tfile:
        docs.append(line)

tokenizer = RegexpTokenizer(r'\w+')
en_stop = get_stop_words('en')
en_stop.extend(['app', 'cnn', 'news'])
#p_stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

texts = []

# loop through document list
for i in docs:

    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    tagged = nltk.pos_tag(stopped_tokens)
    nouns = [i[0] for i in tagged if i[1][0] == 'N']
    # stem tokens
# Converting symptoms scraper to the corresponding root word ( Precalculation )

import re
file = open('symptoms.txt')
data = file.read()

arr = data.split('\n')
#print arr
lls = list()
for line in arr:
    ls = list()
    ls.extend(re.findall(r"[\w']+", line))
    lls.append((list(ls[:])))

from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
nlls = list()
for il in lls:
    nls = list()
    for tmp in il:
        word = tmp.lower()
        temp_n = lmtzr.lemmatize(word)
        temp_v = lmtzr.lemmatize(word, 'v')
        if (temp_n != word) and (temp_v != word):
            nls.append(str(temp_v))
        elif (temp_n == word):
            nls.append(str(temp_v))
        else:
            nls.append(str(temp_n))
    nlls.append((list(nls[:])))
    #print nls
Example #43
from nltk.stem.lancaster import LancasterStemmer

st = LancasterStemmer()

from nltk.stem import PorterStemmer

pt = PorterStemmer()

from nltk.stem.snowball import EnglishStemmer

sb = EnglishStemmer()

from nltk.stem.wordnet import WordNetLemmatizer

wn = WordNetLemmatizer()

##let's examine the word ``better"
st.stem('better')
pt.stem('better')
sb.stem('better')
wn.lemmatize('better', 'a')

wn.lemmatize('families', 'n')
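
Nothing is printed when the lines above run as a plain script, so here is a small addition of my own making the contrast visible; the lemmatizer maps the irregular form to its base word, while the Porter stemmer leaves it unchanged:

print(wn.lemmatize('better', 'a'))    # good
print(wn.lemmatize('families', 'n'))  # family
print(pt.stem('better'))              # better
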

##
##applying the porter stemmer to the gettysburg address

text_5 = map(pt.stem, text_4)

##now creating a dictionary that will count the occurrence of the words
def clean_tokens(df):
    list_of_artists = list(
        set([
            item.lower() for it in
            [nltk.word_tokenize(art) for art in list(dff['artists'])]
            for item in it
        ]))
    ff = []
    hh = []
    lmtzr = WordNetLemmatizer()
    stopwords = list(
        set(
            nltk.corpus.stopwords.words('english') + ['ap'] + ['i'] +
            ["y'all"] + ['m.'] + ['mme'] + ['donot'] + ['rah'] + ['&'] +
            ['de'] + ['b'] + ['ca'] + ['of'] + ['us'] + ['the'] + ['at'] +
            ["in"] + ['and'] + ['be'] + ['it'] + ['what'] + ['sv'] + ['lo'] +
            ['d'] + ['n'] + ['spotify'] + ['record'] + ['studios'] +
            ['chorus'] + ['verse'] + ['intro'] + ['outro']))
    for sent in df:
        #print(sent)
        for token in sent:
            tt = token.replace("'s", " ").replace("n't", "not").replace('-','').replace("'ll", "will").replace('my—','my').\
                replace("'cross",'across').replace("'ve",'have').replace("'bout","about").replace("'m","am").replace("'d","would").replace("'re",'are').\
                replace('wantt','want').replace('mr.','mister').replace('ms.','miss').replace('murda','murder').replace('like-','like').replace('smallz','small')
            tt = re.sub(r'^([0-9]|[0-9][0-9]|[0-9][0-9][0-9])$', ' ',
                        tt)  #remove numbers
            tt = tt.lower()
            if tt == 'wo':
                tt = 'would'
            elif tt == 'gon':
                tt = 'going'
            elif tt == 'wan':
                tt = 'want'
            elif tt == 'na' or tt == "ta":
                tt = 'to'
            elif tt == 'ya':
                tt = 'you'
            elif tt == 'lil':
                tt = 'little'
            elif tt == 'ain':
                tt = 'am'
            elif tt == "'em" or tt == "em":
                tt = 'them'
            elif tt == 'cause' or tt == "'cause":
                tt = 'because'
            elif tt == 't':
                tt = 'not'
            elif tt == 'till' or tt == "'till" or tt == "'til" or tt == "til":
                tt = 'until'
            elif tt.endswith('—') == True:
                tt = tt.split('—')[0]
            elif tt == 'hol':
                tt = 'hold'
            elif tt == 'l':
                tt = 'lost'
            elif tt == 'cali':
                tt = 'california'
            tt = tt.split('_')

            if len(tt) == 1:
                if tt[0] not in [
                        '[]', '[:’', ':', '[', ']', '?', ',', ')', '(', ' ',
                        ';', '—', '!', "'", '’', '.', '"', "...", '“', '”',
                        "”", 'mme', "''", '``', "''", 'si', 'vv', 'c', '”',
                        'ii', '+', '$'
                ] and tt[0] not in stopwords and tt[0] not in list_of_artists:
                    ff.append(tt[0])
            else:
                for t in tt:
                    if t not in [
                            '[]', '[:’', ':', '[', ']', '?', ',', ')', '(',
                            ' ', ';', '—', '!', "'", '’', '.', '"', "...", '“',
                            '”' + 'mme', "''", '``', "''", 'si', 'vv', 'c',
                            '”', 'ii', '+', '$'
                    ] and t not in stopwords and t not in list_of_artists:
                        ff.append(t)

        lemmas = [lmtzr.lemmatize(xt, 'v') for xt in ff]
        hh.append(lemmas)
        ff = []
    return hh
Example #45
import nltk
# nltk.download()               # Just for one time downloading with GUI
# nltk.download("stopwords")    # Just for one time downloading with command


# https://www.nltk.org/book/
# 2. Text Pre-processing : Lexicon Normalization
# Stemming
from nltk.stem.porter import PorterStemmer
stem = PorterStemmer()
word = "races"
print(stem.stem(word))

# Lemmatization
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
word = "playing"
print(lem.lemmatize(word, "v"))
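
Because lemmatize() treats its input as a noun by default, the POS argument matters; a small illustrative addition:

print(lem.lemmatize(word))           # 'playing' -- without a POS tag the default is noun, so nothing changes
print(lem.lemmatize("better", "a"))  # 'good' -- adjectives are mapped through WordNet's exception lists
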


# 2. Text Pre-processing : Object Standardization
dictionary = {
                        "brb": "be right back",
                        "cb": "call back",
                        "awsm": "awesome",
                        "lol": "laugh out loud"
                    }

def objectStandardization(text):
    words = text.split()
    substitutedWords = []
Example #46
from nltk.stem.wordnet import WordNetLemmatizer
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#word_tokenize accepts a string as an input, not a file.
wordlist = ["one", "im", "would", "also", "ive", "lol"]
stop_words = set(stopwords.words('english'))
file1 = open("C:\TuDiabetes_Code\Diabetes_Text_New\All.txt")
line = file1.read()  # Use this to read file content as a stream:
words = line.split()
for r in words:
    if not r in stop_words:
        appendFile = open('C:\TuDiabetes_Code\Diabetes_Text_New\CleanText.txt',
                          'a')
        appendFile.write(" " + r)
        appendFile.close()

lemma = WordNetLemmatizer()
exclude = set(string.punctuation)
stoplist = stopwords.words('english')
stoplist = stoplist + wordlist

stop = set(stoplist)
# stop= stop.append
# print type(stop)
# print(stop)
# exit(0)
stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
return normalized
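
An illustrative call of my own (assumes the NLTK stopwords corpus is available):

print(clean("The dogs are running in the parks!"))
# e.g. "dog running park"
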
Example #47
def head_related(query, candidate):
    lmt = WordNetLemmatizer()

    sd = StanfordDependencies.get_instance(backend='subprocess')
    a = Annotator()
    synTree = a.getAnnotations(query)['syntax_tree']

    tokens = sd.convert_tree(synTree)
    queue = []
    for i, token in enumerate(tokens):
        if token[6] == 0:
            queue.append((i + 1, token))

    qHeadWords = []
    while queue != []:
        s = queue[0]
        queue.remove(s)
        flag = 0
        #print s[1][1], s[0]
        for i, word in enumerate(tokens):
            if word[6] == s[0]:
                flag = 1
                queue.append((i + 1, word))
        if flag == 1:
            qHeadWords.append(lmt.lemmatize(s[1][1], 'v'))

    synTree = a.getAnnotations(candidate)['syntax_tree']

    tokens = sd.convert_tree(synTree)
    queue = []
    for i, token in enumerate(tokens):
        if token[6] == 0:
            queue.append((i + 1, token))

    cHeadWords = []
    while queue != []:
        s = queue[0]
        queue.remove(s)
        flag = 0
        #print s[1][1], s[0]
        for i, word in enumerate(tokens):
            if word[6] == s[0]:
                flag = 1
                queue.append((i + 1, word))
        if flag == 1:
            cHeadWords.append(lmt.lemmatize(s[1][1], 'v'))

    queryRel = []
    for word in qHeadWords:
        for i, j in enumerate(wn.synsets(word)):
            for l in j.lemmas():
                queryRel.append(l.name())
            #queryRel.append(l.lemma_names() for l in j.hypernyms())
            for l in j.hypernyms():
                for k in l.lemma_names():
                    queryRel.append(k)
            for l in j.hyponyms():
                for k in l.lemma_names():
                    queryRel.append(k)

    candidateRel = []
    for word in cHeadWords:
        for i, j in enumerate(wn.synsets(word)):
            for l in j.lemmas():
                candidateRel.append(l.name())
            #queryRel.append(l.lemma_names() for l in j.hypernyms())
            for l in j.hypernyms():
                for k in l.lemma_names():
                    candidateRel.append(k)
            for l in j.hyponyms():
                for k in l.lemma_names():
                    candidateRel.append(k)

    exactHeadScore = 0
    count = 0
    for j in cHeadWords:
        count = count + 1
        for i in qHeadWords:
            #print i,j
            if i == j:
                exactHeadScore = exactHeadScore + 1
    try:
        exactHeadScore = exactHeadScore / count
    except ZeroDivisionError:
        exactHeadScore = 0
    #print "Exact Head Score\n"

    relHeadScore = 0
    count = 0
    for j in candidateRel:
        count = count + 1
        if j in queryRel:
            relHeadScore = relHeadScore + 1

    try:
        relHeadScore = relHeadScore / count
    except ZeroDivisionError:
        relHeadScore = 0
    #print "Relative Head Score\n"
    return relHeadScore, exactHeadScore
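# Hypothetical usage of head_related() above (illustration only; it assumes the
# practNLPTools Annotator and PyStanfordDependencies backends are installed):
rel_score, exact_score = head_related("Why did the dog bark?",
                                      "The dog barked because it saw a stranger.")
print(rel_score, exact_score)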
Example #48
0
for i in range(len(files)):
    file = open(path + files[i], 'r')
    text = file.read()
    file.close()
    books.append(text)

# corpuses = categorized + plaintext
corpuses = [books]
''' Reading Input File '''
file = open("test.txt", 'r')
text = file.read()

words = word_tokenize(text)
words = [w.lower() for w in words]

lmtzr = WordNetLemmatizer()
words = [lmtzr.lemmatize(w) for w in words]

count_of_words = len(words)
fd = nltk.FreqDist(words)
''' Blob Parsing '''
# blob = TextBlob(text)
# words = [n.lower() for n,t in blob.tags if t == 'NN' or t == 'NNP']
''' Stop Words Removal '''
stop_words = stop_words()
words = [w for w in words if w not in stop_words]
words = [w for w in words if w.isalpha() and len(w) > 1]
words = list(set(words))

words_dict = {}
    text_file = "fables-01.sch"
    dep_file = "fables-01.sch.dep"
    q_file = "fables-01.questions.dep"

    # Read the dependency graphs into a list
    sgraphs = read_dep_parses(dep_file)
    qgraphs = read_dep_parses(q_file)

    # TODO: You may need to include different rules in find_answer() for
    # different types of questions. For example, the rule here is good for
    # answering "Where was the crow sitting?", but not necessarily the others.
    # You would have to figure this out like in the chunking demo; a rough
    # sketch of one possible dispatch follows this example.
    for qgraph in qgraphs:
        print("Question:", pretty_question(qgraph), "?")
        answer = find_answer(qgraph, sgraphs)
        print("Answer:", answer)
        print()

    # example of how to use a lemmatizer
    print("\nLemma:")
    lmtzr = WordNetLemmatizer()
    for node in sgraphs[1].nodes.values():
        tag = node["tag"]
        word = node["word"]
        if word is not None:
            if tag.startswith("V"):
                print(lmtzr.lemmatize(word, 'v'))
            else:
                print(lmtzr.lemmatize(word, 'n'))
    print()
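# A minimal sketch (not part of the original assignment code) of how find_answer()
# could branch on the question word to apply different dependency rules; the
# get_question_word() helper and the relations chosen here are assumptions for
# illustration only.
def find_answer_by_type(qgraph, sgraphs):
    qword = get_question_word(qgraph)  # hypothetical helper returning "where", "who", ...
    if qword == "where":
        wanted_rel = "nmod"    # location-like modifier
    elif qword == "who":
        wanted_rel = "nsubj"   # subject of the main verb
    else:
        wanted_rel = "dobj"    # fall back to the direct object
    for sgraph in sgraphs:
        for node in sgraph.nodes.values():
            if node.get("rel") == wanted_rel and node.get("word") is not None:
                return node["word"]
    return None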
Example #50
0
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict

tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

text = "Another way of achieving this task. I ate an apple."
text2 = "ate"
tokens = word_tokenize(text)
lmtzr = WordNetLemmatizer()

for token, tag in pos_tag(tokens):
    lemma = lmtzr.lemmatize(token, tag_map[tag[0]])
    print(token, "=>", lemma, tag)
lemma2 = lmtzr.lemmatize(text2, tag_map['0'])  # unknown key '0' falls back to the NOUN default
print(lemma2)
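# A small reusable wrapper built on the same tag_map (an illustration, not part of
# the example above), lemmatizing every token in a sentence with its POS tag:
def lemmatize_sentence(sentence):
    return [lmtzr.lemmatize(tok, tag_map[tag[0]]) for tok, tag in pos_tag(word_tokenize(sentence))]

# e.g. lemmatize_sentence("The bats were hanging on their feet") should map
# "bats" -> "bat", "were" -> "be", "feet" -> "foot" (assuming the tagger assigns
# the expected NNS/VBD tags).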
Example #51
0
df = pd.read_csv('breast-cancer-wisconsin.data')
X = df._get_numeric_data()  # keep only the numeric columns of the DataFrame
# delete 'Survived', the response vector (Series)
X.drop('Survived', axis=1, inplace=True)
# we drop age for the sake of this example because it contains NaN in some examples
X.drop('Age', axis=1, inplace=True)


# Total length of the texts before cleaning
sum([len(x) for x in data]) #160

#Cleaning and Tokenizing data
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)

lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    remove_numbers =  re.sub(r"[0-9]+", "", punc_free)
    normalized = " ".join(lemma.lemmatize(word) for word in remove_numbers.split())
    return normalized

texts = [text for text in data if len(text) > 2]
doc_clean = [clean(doc).split() for doc in texts]

all_words = sum(doc_clean,[])#removing the nested lists and making one list
#dictionary = corpora.Dictionary(doc_clean)
#doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

Example #52
0
 def lemmatize(self, tweet):
     lem = WordNetLemmatizer()
     words = tweet.split(" ")
     words = np.array([lem.lemmatize(word) for word in words])
     tweet = " ".join(words)
     return tweet
Example #53
0
def lemmatize(features):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(feature) for feature in features]
Example #54
0
"""
Created on Wed Mar 22 14:12:34 2017

@author: Pooja Lahoti
"""

import nltk
import re
import json
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from nltk.stem.lancaster import LancasterStemmer
ls = LancasterStemmer()
wnl = WordNetLemmatizer()
#reload(sys)
#sys.setdefaultencoding('utf8')
#from nltk.stem.porter import PorterStemmer
#ps = PorterStemmer()
#
#from nltk.stem.snowball import SnowballStemmer
#ss = SnowballStemmer("english")
#
#stopwords.append(unicode("trump", "utf-8"))
#stopwords.append(unicode("https", "utf-8"))
#stopwords.append(unicode("Donald", "utf-8"))
#stopwords.append(unicode("@realdonald", "utf-8"))
#stopwords.append(unicode("RT", "utf-8"))
#stopwords = set(stopwords)
#stopwords.update(("https","geo","trump"))
def clean_data(data):
    words_to_exclude = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()

    return [clean(doc, words_to_exclude, exclude, lemma) for doc in data]
Example #56
0
class tfidf:
    def __init__(self):
        # Data Fetch
        # data_folder = 'C:/Users/yashd/PycharmProjects/txt_search/'
        self.meta_cols = {"id": None, "original_title": None, "overview": None, "release_date": None}
        meta_data = pd.read_csv('movies_metadata.csv', usecols=self.meta_cols.keys(), index_col="id")
        self.meta_data = meta_data.dropna(subset=["overview"])
        self.N = self.meta_data.shape[0]

        # Pre-processing
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
        self.stopword = stopwords.words('english')
        self.stemmer = SnowballStemmer('english')  # SnowballStemmer requires a language
        self.lemmatizer = WordNetLemmatizer()
        
        self.inverted_index = {}
        self.document_vector = {}

        if os.path.isfile("invertedIndexPickle.pkl"):
            self.inverted_index = pickle.load(open('invertedIndexPickle.pkl', 'rb'))
            self.document_vector = pickle.load(open('documentVectorPickle.pkl', 'rb'))
        else:
            self.build()
            self.save()
    
    def build(self):
        self.create_inverted_index()
        self.build_doc_vector()

    def save(self):
        pickle.dump(self.inverted_index, open('invertedIndexPickle.pkl', 'wb+'))
        pickle.dump(self.document_vector, open('documentVectorPickle.pkl', 'wb+'))

    def create_inverted_index(self):
        for row in self.meta_data.itertuples():
            index = getattr(row, 'Index')
            data = []
            for col in self.meta_cols.keys():
                if col != "id":
                    col_values = getattr(row, col)
                    parameters = self.meta_cols[col]
                    if parameters is None:
                        data.append(col_values if isinstance(col_values, str) else "")
                    else:
                        col_values = ast.literal_eval(col_values if isinstance(col_values, str) else '[]')
                        if type(col_values) == bool:
                            continue
                        else:
                            for col_value in col_values:
                                for param in parameters:
                                    data.append(col_value[param])
            # insert once per document, after all columns have been collected
            self.insert(index, self.pre_processing(' '.join(data)))

    def build_doc_vector(self):
        for token_key in self.inverted_index:
            token_values = self.inverted_index[token_key]
            idf = math.log10(self.N / token_values["df"])
            for doc_key in token_values:
                if doc_key != "df":
                    tf_idf = (1 + math.log10(token_values[doc_key])) * idf
                    if doc_key not in self.document_vector:
                        self.document_vector[doc_key] = {token_key: tf_idf, "_sum_": math.pow(tf_idf, 2)}
                    else:
                        self.document_vector[doc_key][token_key] = tf_idf
                        self.document_vector[doc_key]["_sum_"] += math.pow(tf_idf, 2)

        for doc in self.document_vector:
            tf_idf_vector = self.document_vector[doc]
            normalize = math.sqrt(tf_idf_vector["_sum_"])
            for tf_idf_key in tf_idf_vector:
                tf_idf_vector[tf_idf_key] /= normalize

    def insert(self, index, tokens):
        for token in tokens:
            if token in self.inverted_index:
                value = self.inverted_index[token]
                if index in value.keys():
                    value[index] += 1
                else:
                    value[index] = 1
                    value["df"] += 1
            else:
                self.inverted_index[token] = {index: 1, "df": 1}

    def pre_processing(self, data_string):
        tokens = self.tokenizer.tokenize(data_string)
        processed_data = []
        for t in tokens:
            t = t.lower()  # lower-case first so the stopword check actually matches
            if t not in self.stopword:
                processed_data.append(self.lemmatizer.lemmatize(t))
        return processed_data

    def get_relevant_docs(self, query_list):
        relevant_docs = set()
        for query in query_list:
            if query in self.inverted_index:
                keys = self.inverted_index[query].keys()
                for key in keys:
                    relevant_docs.add(key)
        if "df" in relevant_docs:
            relevant_docs.remove("df")
        # print(relevant_docs)
        return relevant_docs

    def build_query_vector(self, processed_query):
        query_vector = {}
        tf_vector = {}
        idf_vector = {}
        sum = 0
        for token in processed_query:
            if token in self.inverted_index:
                # tf_idf = (1 + math.log10(processed_query.count(token))) * math.log10(N/inverted_index[token]["df"])
                tf = (1 + math.log10(processed_query.count(token)))
                tf_vector[token] = tf
                idf = (math.log10(self.N / self.inverted_index[token]["df"]))
                idf_vector[token] = idf
                tf_idf = tf * idf
                query_vector[token] = tf_idf
                sum += math.pow(tf_idf, 2)
        sum = math.sqrt(sum)
        for token in query_vector:
            query_vector[token] /= sum
        return query_vector, idf_vector, tf_vector

    def similarity(self, relevant_docs, query_vector, idf_vector, tf_vector):
        FinalScore = {}
        IdfScore = {}
        TfScore = {}
        tf_term_new = {}
        idf_term_new = {}
        tf_idf_term_new = {}
        for doc in relevant_docs:
            score_final = 0
            score_idf = 0
            score_tf = 0
            score_tf_idf_term = {}

            for token in query_vector:
                doc_weight = self.document_vector[doc][token] if token in self.document_vector[doc] else 0
                score_tf_idf = query_vector[token] * doc_weight
                score_final += score_tf_idf
                score_idf += idf_vector[token] * doc_weight
                score_tf += tf_vector[token] * doc_weight
                score_tf_idf_term[token] = score_tf_idf

            FinalScore[doc] = score_final
            IdfScore[doc] = score_idf
            TfScore[doc] = score_tf
            # keep the per-term breakdowns so get_movie_info can display them
            tf_term_new[doc] = list(tf_vector.items())
            idf_term_new[doc] = list(idf_vector.items())
            tf_idf_term_new[doc] = list(score_tf_idf_term.items())

        sorted_FinalScore = sorted(FinalScore.items(), key=operator.itemgetter(1), reverse=True)

        return sorted_FinalScore[:50], tf_term_new, idf_term_new, tf_idf_term_new

    def get_movie_info(self, sorted_score_list, tf_new, idf_new, tf_idf_new):
        result = []
        for entry in sorted_score_list:
            doc_id = entry[0]
            row = self.meta_data.loc[doc_id]
            info = (row["original_title"],
                    row["overview"] if isinstance(row["overview"], str) else "", entry[1], idf_new[doc_id],
                    tf_new[doc_id], tf_idf_new[doc_id], row["release_date"])
            result.append(info)
        new_score = None
        # print(result[0:5])
        return result
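# A minimal usage sketch for the tfidf class above (it assumes movies_metadata.csv
# is present and the NLTK stopword data is downloaded; the query text is made up):
engine = tfidf()
query_tokens = engine.pre_processing("space adventure on a distant planet")
docs = engine.get_relevant_docs(query_tokens)
query_vec, idf_vec, tf_vec = engine.build_query_vector(query_tokens)
ranked, tf_new, idf_new, tf_idf_new = engine.similarity(docs, query_vec, idf_vec, tf_vec)
for title, overview, score, *rest in engine.get_movie_info(ranked, tf_new, idf_new, tf_idf_new)[:5]:
    print(round(score, 4), title)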
Example #57
0
class ToxicComment:
    _eng_stopwords = set(stopwords.words("english"))
    _lemmatizer = WordNetLemmatizer()
    _tokenizer = TweetTokenizer()
    _appos = {
        "aren't": "are not",
        "can't": "cannot",
        "couldn't": "could not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'll": "he will",
        "he's": "he is",
        "i'd": "I would",
        "i'd": "I had",
        "i'll": "I will",
        "i'm": "I am",
        "isn't": "is not",
        "it's": "it is",
        "it'll": "it will",
        "i've": "I have",
        "let's": "let us",
        "mightn't": "might not",
        "mustn't": "must not",
        "shan't": "shall not",
        "she'd": "she would",
        "she'll": "she will",
        "she's": "she is",
        "shouldn't": "should not",
        "that's": "that is",
        "there's": "there is",
        "they'd": "they would",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "we'd": "we would",
        "we're": "we are",
        "weren't": "were not",
        "we've": "we have",
        "what'll": "what will",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "where's": "where is",
        "who'd": "who would",
        "who'll": "who will",
        "who're": "who are",
        "who's": "who is",
        "who've": "who have",
        "won't": "will not",
        "wouldn't": "would not",
        "you'd": "you would",
        "you'll": "you will",
        "you're": "you are",
        "you've": "you have",
        "'re": " are",
        "wasn't": "was not",
        "we'll": " will",
        "didn't": "did not"
    }

    @staticmethod
    def _clean(comment):
        # make all characters lower cased
        comment = comment.lower()
        # remove new line character
        comment = re.sub('\\n', ' ', comment)
        # remove ip addresses
        comment = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', comment)
        # remove usernames
        comment = re.sub(r'\[\[.*\]', '', comment)
        # split the comment into words
        words = ToxicComment._tokenizer.tokenize(comment)
        # replace that's to that is by looking up the dictionary
        words = [
            ToxicComment._appos[word] if word in ToxicComment._appos else word
            for word in words
        ]
        # replace variation of a word with its base form
        words = [
            ToxicComment._lemmatizer.lemmatize(word, "v") for word in words
        ]
        # eliminate stop words
        words = [w for w in words if not w in ToxicComment._eng_stopwords]
        # now we will have only one string containing all the words
        clean_comment = " ".join(words)
        # remove all non alphabetical characters
        clean_comment = re.sub("\W+", " ", clean_comment)
        clean_comment = re.sub("  ", " ", clean_comment)
        return clean_comment

    def __init__(self, csv_row, glove_model, comment_max_length):
        self._id = csv_row['id']
        self._comment_text = csv_row['comment_text']
        self._tokens = word_tokenize(
            ToxicComment._clean(csv_row['comment_text']))
        self._labels = np.array([
            float(csv_row['toxic']),
            float(csv_row['severe_toxic']),
            float(csv_row['obscene']),
            float(csv_row['threat']),
            float(csv_row['insult']),
            float(csv_row['identity_hate'])
        ])
        self._indexed_tokens = np.zeros(shape=[comment_max_length],
                                        dtype=np.int32)
        self._token_count = min(len(self._tokens), comment_max_length)
        for i, token in enumerate(self._tokens):
            if i < comment_max_length:
                token = token.lower()
                index = glove_model.token_to_embedding['something'].index
                if token in glove_model.token_to_embedding:
                    index = glove_model.token_to_embedding[token].index
                self._indexed_tokens[i] = index
            else:
                break

    @property
    def tokens(self):
        return self._tokens

    @property
    def labels(self):
        return self._labels

    @property
    def indexed_tokens(self):
        return self._indexed_tokens

    @property
    def token_count(self):
        return self._token_count

    @property
    def id(self):
        return self._id

    @property
    def comment_text(self):
        return self._comment_text
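# A quick illustration (not from the original) of ToxicComment._clean on its own;
# it only needs the NLTK stopword/wordnet corpora, not a GloVe model or CSV row.
sample = "You are SO wrong!!\nCheck 10.0.0.1 please."
print(ToxicComment._clean(sample))   # roughly: "wrong check please"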
Example #58
0
def main():
    parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument('--data', type=str, default='../data/',
                        help='location of the data corpus')
    parser.add_argument('--presaved', action='store_true',
                        help='use presaved data')
    parser.add_argument('--glovedata', type=str, default='../data/glove.6B',
                        help='location of the pretrained glove embeddings')
    parser.add_argument('--din', type=int, default=30,
                        help='length of LSTM')
    parser.add_argument('--demb', type=int, default=100,
                        help='size of word embeddings')
    parser.add_argument('--dhid', type=int, default=100,
                        help='humber of hidden units per layer')
    parser.add_argument('--dout', type=int, default=2,
                        help='number of output classes')
    parser.add_argument('--nlayers', type=int, default=1,
                        help='number of layers')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='initial learning rate')
    parser.add_argument('--clip', type=float, default=0.25,
                        help='gradient clipping')
    parser.add_argument('--embinit', type=str, default='random',
                        help='embedding weight initialization type')
    parser.add_argument('--decinit', type=str, default='random',
                        help='decoder weight initialization type')
    parser.add_argument('--hidinit', type=str, default='random',
                        help='recurrent hidden weight initialization type')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--epochs', type=int, default=40,
                        help='upper epoch limit')
    parser.add_argument('--batchsize', type=int, default=20, metavar='N',
                        help='batch size')
    parser.add_argument('--seed', type=int, default=3,
                        help='random seed')
    parser.add_argument('--vocabsize', type=int, default=200000,
                        help='vocabulary size')
    parser.add_argument('--optimizer', action='store_true',
                        help='use ADAM optimizer')
    parser.add_argument('--pipeline', action='store_true',
                        help='use pipeline file')
    parser.add_argument('--psw', type=int, default=1,
                        help='remove stop words')
    parser.add_argument('--ppunc', action='store_true',
                        help='remove punctuation')
    parser.add_argument('--pntok', action='store_true',
                        help='use number tokens')
    parser.add_argument('--pkq', action='store_true',
                        help='keep question words')
    parser.add_argument('--stem', action='store_true',
                        help='use stemmer')
    parser.add_argument('--lemma', action='store_true',
                        help='use lemmatizer')
    parser.add_argument('--freezeemb', action='store_false',
                        help='freezes embeddings')
    parser.add_argument('--cuda', action='store_true',
                        help='use CUDA')
    parser.add_argument('--loginterval', type=int, default=100, metavar='N',
                        help='report interval')
    parser.add_argument('--save', type=str,  default='',
                        help='path to save the final model')
    args = parser.parse_args()


    if not args.presaved:
        pipe = None
        if args.pipeline:
            stemmer, lemmatizer = None, None
            if args.stem:
                stemmer = SnowballStemmer('english')
            elif args.lemma:
                lemmatizer = WordNetLemmatizer()

            pipe = functools.partial(pipeline, 
                                    rm_stop_words=args.psw, 
                                    rm_punc=args.ppunc, 
                                    number_token=args.pntok, 
                                    keep_questions=args.pkq,
                                    stemmer=stemmer,
                                    lemmatizer=lemmatizer)

        corpus = TacoText(args.vocabsize, lower=True, vocab_pipe=pipe)
        print('Loading Data')
        # train_data = pd.read_csv(args.data)
        #Shuffle order of training data

        # train_data = train_data.reindex(np.random.permutation(train_data.index))
        # val_data = train_data.iloc[int(len(train_data) * 0.9):]
        # train_data = train_data.iloc[:int(len(train_data) * 0.9)]
        train_data = pd.read_csv('../data/train_data_shuffle.csv')
        val_data = pd.read_csv('../data/val_data_shuffle.csv')

        print('Cleaning and Tokenizing')
        q1, q2, y = clean_and_tokenize(train_data, corpus)
        q1_val, q2_val, y_val = clean_and_tokenize(val_data, corpus)

        train_feat = list(map(feature_gen, zip(q1, q2)))
        val_feat = list(map(feature_gen, zip(q1_val, q2_val)))
        scalar = preprocessing.StandardScaler()
        train_feat = scalar.fit_transform(train_feat)
        val_feat = scalar.transform(val_feat)

        print('Piping Data')
        q1 = corpus.pipe_data(q1)
        q2 = corpus.pipe_data(q2)
        q1_val = corpus.pipe_data(q1_val)
        q2_val = corpus.pipe_data(q2_val)

        corpus.gen_vocab(q1 + q2 + q2_val + q1_val)

        n_feat = train_feat.shape[1]
        d_in = args.din
        feat_max = int(np.max([n_feat, d_in]))

        X = torch.Tensor(len(train_data), 1, 3, feat_max)
        X[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1, feat_max)).long()
        X[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2, feat_max)).long()
        X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(train_feat))
        y = torch.from_numpy(np.array(y)).long()

        X_val = torch.Tensor(len(val_data), 1, 3, feat_max)
        X_val[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val, feat_max)).long()
        X_val[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val, feat_max)).long()
        X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat))
        y_val = torch.from_numpy(np.array(y_val)).long()

        torch.save(X, '../data/X_featd.t')
        torch.save(y, '../data/y_featd.t')
        torch.save(X_val, '../data/X_val_featd.t')
        torch.save(y_val, '../data/y_val_featd.t')
        with open('../data/corpus_featd.pkl', 'wb') as corp_f:
            pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)

    else:
        n_feat = 22
        d_in = args.din
        print('Loading Presaved Data')
        X = torch.load(args.data + 'X_featd.t')
        y = torch.load(args.data + 'y_featd.t')
        X_val = torch.load(args.data + 'X_val_featd.t')
        y_val = torch.load(args.data + 'y_val_featd.t')
        with open('../data/corpus_featd.pkl', 'rb') as f:
            corpus = pkl.load(f)


    if args.cuda:
        X, y = X.cuda(), y.cuda()
        X_val, y_val = X_val.cuda(), y_val.cuda()

    print('Generating Data Loaders')
    #X.size len(train_data),1,2,fix_length
    train_dataset = TensorDataset(X, y)
    train_loader = DataLoader(train_dataset, 
                                batch_size=args.batchsize, 
                                shuffle=True)
    valid_loader = DataLoader(TensorDataset(X_val, y_val),
                                batch_size=args.batchsize,
                                shuffle=False)

    ntokens = len(corpus)
    glove_embeddings = None
    if args.embinit == 'glove':
        assert args.demb in (50, 100, 200, 300)
        glove_embeddings = get_glove_embeddings(args.glovedata, corpus.dictionary.word2idx, ntokens, args.demb)

    model = LSTMModelMLPFeatDist(args.din, args.dhid, args.nlayers, args.dout, args.demb, n_feat, args.vocabsize, 
                        args.dropout, args.embinit, args.hidinit, args.decinit, glove_embeddings,
                        args.cuda)

    if args.cuda:
        model.cuda()

    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    model_config = '\t'.join([str(x) for x in (torch.__version__, args.clip, args.nlayers, args.din, args.demb, args.dhid, 
                        args.embinit, args.decinit, args.hidinit, args.dropout, args.optimizer, args.lr, args.vocabsize,
                        args.pipeline, args.psw, args.ppunc, args.pntok, args.pkq, args.stem, args.lemma)])

    print('Pytorch | Clip | #Layers | InSize | EmbDim | HiddenDim | EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer| LR | VocabSize | pipeline | stop | punc | ntoken | keep_ques | stem | lemma')
    print(model_config)

    # best_val_acc = 0.78
    best_ll = 0.5
    for epoch in range(args.epochs):
        model.train()
        total_cost = 0
        start_time = time.time()
        cur_loss = 0
        for ind, (qs, duplicate) in enumerate(train_loader):
            model.zero_grad()
            pred = model(qs[:, 0, 0, :d_in].long(), qs[:, 0, 1, :d_in].long(), qs[:, 0, 2, :n_feat])
            if args.cuda:
                pred = pred.cuda()
                duplicate = duplicate.cuda()
            duplicate = Variable(duplicate)
            loss = criterion(pred, duplicate)
            loss.backward()
            clip_grad_norm(model.parameters(), args.clip)

            if args.optimizer:  # only step the Adam optimizer when --optimizer is set
                optimizer.step()
            else:
                for p in model.parameters():
                    p.data.add_(-args.lr, p.grad.data)

            total_cost += loss.data[0]
            cur_loss += loss.data[0]

            if ind % args.loginterval == 0 and ind > 0:
                cur_loss = cur_loss / args.loginterval  # average loss over the logging interval
                elapsed = time.time() - start_time
                print('| Epoch {:3d} | {:5d}/{:5d} Batches | ms/batch {:5.2f} | '
                        'Loss {:.6f}'.format(
                            epoch, ind, len(X) // args.batchsize,
                            elapsed * 1000.0 / args.loginterval, cur_loss))
                start_time = time.time()
                cur_loss = 0

        model.eval()

        train_acc, train_ll = evaluate(model, train_loader, args.cuda, d_in, n_feat)
        val_acc, val_ll = evaluate(model, valid_loader, args.cuda, d_in, n_feat)
        # if args.save and (val_acc > best_val_acc):
        if args.save and (val_ll < best_ll):
            with open(args.save + '_corpus.pkl', 'wb') as corp_f:
                pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)
            torch.save(model.cpu(), args.save)
            torch.save(model.cpu().state_dict(), args.save + ".state_dict")
            with open(args.save + ".state_dict.config", "w") as f:
                f.write(model_config)
            best_ll = val_ll
            if args.cuda:
                model.cuda()


        print('Epoch: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | Val Accuracy: {:.4f} | Train LL: {:.4f} | Val LL: {:.4f}'.format(
            epoch, total_cost, train_acc, val_acc, train_ll, val_ll))
        print('-' * 89)
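# A hypothetical invocation of this training script (the script name and data paths
# are placeholders; adjust them to the actual file names):
#   python train.py --data ../data/ --glovedata ../data/glove.6B --embinit glove \
#          --demb 100 --lemma --optimizer --epochs 40 --save ../models/lstm_feat.pt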
Example #59
0
import networkx as nx
import numpy as np
from copy import copy
import string

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from scipy import argmax
from scipy.spatial.distance import cosine

from galaxy.vector import vectorize

LEMMATIZER = WordNetLemmatizer()

STOPWORDS = set(list(string.punctuation) + stopwords.words('english'))

# Level Definitions
HIGH = 0
MED = 1
LOW = 2

WEIGHTS = {HIGH: 2.0, MED: 1.5, LOW: 1.0}


def sentencize(plain_text):
    sentences = sent_tokenize(plain_text)
    sentences = [tokenize(s) for s in sentences]
    return [RankedSentence(sentence=s, level=LOW) for s in sentences]
Example #60
0
# POS tags to keep
expected_tags = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ", \
                        "NN", "NNS", "NNP", "NNPS", \
                        "JJ", "JJR", "JJS"]

# load the stopword list
fs = open('../../stopwords_en.txt')
stopwords = fs.read()
swlist = stopwords.splitlines()
fs.close()

print("step1:加载语料库及预处理")
timestamp = time.time()
corpus = []  #存放语料库,每个元素代表一篇文档
if not os.path.exists('../../corpus/segwords.txt'):
    lemmatizer = WordNetLemmatizer()
    with open('../../corpus/news.txt', 'r') as df:
        for line in df:
            if len(line.strip()) != 0:
                words = word_tokenize(line.strip())
                tags = pos_tag(words)
                seglist = []
                for i in range(len(words)):
                    if tags[i][1] in expected_tags and words[
                            i] not in swlist and words[i].isalpha():
                        taghead = tags[i][1][0].lower()
                        # {ADJ:a, ADJ_SAT:s, ADV:r, NOUN:n or VERB:v} lemmatization
                        seglist.append(
                            lemmatizer.lemmatize(
                                words[i],
                                wordnet.ADJ if taghead == 'j' else taghead))