Example no. 1
    def extrapolate(self, sent):

        # tags the part of speech in each word
        tagged = pos_tag(word_tokenize(sent))

        tag_list = []
        for item in tagged:
            tag_list.append(list(item))

        # puts nouns and verbs in their base form
        for idx, item in enumerate(tag_list):
            if item[1][0] == 'V':
                tag_list[idx][0] = wnl().lemmatize(item[0],'v')
            elif item[1] == 'NN' or item[1] == 'NNS':
                tag_list[idx][0] = wnl().lemmatize(item[0],'n')

        synonyms = [[] for i in range(len(tag_list))]

        # finds synonyms for each noun, verb, adj in tag_list -> puts them at the corresponding index in synonyms
        for idx, item in enumerate(tag_list):
            if item[1][0] == 'V':
                synonyms[idx] = self.find_synonyms(item[0], wordnet.VERB)
                #for v in synonyms[idx]:
                 #   v = en.verb.past(v)
            elif item[1] == 'NN' or item[1] == 'NNS':
                synonyms[idx] = self.find_synonyms(item[0], wordnet.NOUN)
            elif item[1][0] == 'J':
                synonyms[idx] = self.find_synonyms(item[0], wordnet.ADJ)

        # gets rid of duplicates
        for si, s in enumerate(synonyms):
            synonyms[si] = list(set(s))
            # print(tag_list[si][0], ": ", synonyms[si])

        self.sent_syns = synonyms

        search_sent = []
        # creates a list of similar sentences to search for
        for idx, item in enumerate(tag_list):
            # looks for synonyms at the corresponding index
            for s in synonyms[idx]:
                temp = sub(r"\b%s\b" %item[0], s, sent)
                search_sent.append(temp)

        # gets rid of duplicate sentences
        search_sent = list(set(search_sent))

        # print("\nSample list of synonymous sentences:")
        # for i in range(min(len(search_sent), 20)):
        #     print(search_sent[i])

        return search_sent
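A minimal standalone sketch of the same synonym-substitution idea, assuming find_synonyms (not shown in this example) simply collects lemma names from the WordNet synsets of the requested part of speech:

import re
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet

def find_synonyms(word, pos):
    # Gather lemma names from every WordNet synset of the requested POS.
    names = set()
    for syn in wordnet.synsets(word, pos=pos):
        for lemma in syn.lemmas():
            names.add(lemma.name().replace('_', ' '))
    names.discard(word)
    return list(names)

sent = "The cat chased the mouse"
variants = []
for word, tag in pos_tag(word_tokenize(sent)):
    pos = wordnet.VERB if tag.startswith('V') else wordnet.NOUN if tag.startswith('N') else None
    if pos is None:
        continue
    for syn in find_synonyms(word, pos):
        variants.append(re.sub(r"\b%s\b" % re.escape(word), syn, sent))
print(sorted(set(variants))[:5])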
Example no. 2
  def calc_fd(self):
    """
    This function calculates the frequency distributions of unigrams, bigrams
    and trigrams from all the scraped ads (combined). It also finds the bigrams
    and trigrams that are likely combinations of words (i.e. those that make 
    sense).
    Output:
      fd_mono - unigram frequency distribution.
      fd_bi - bigram frequency distribution.
      fd_tri - trigram frequency distribution.
      goodbi - bigrams that have high measure of pointwise mutual information
      goodtri - trigrams that have high measure of pointwise mutual information
    """
    all_mono = []
    all_bi = []
    all_tri = []
    lmt = wnl()
    combined_ads = []

#Concatenate unigrams, bigrams and trigrams from different ads together so
#that we don't need to make frequency distributions for each one (we only care
#about the collective anyway).
    for ad in self.ads:
      combined_ads = combined_ads+ad

      btemp = nltk.bigrams(ad)
      all_bi += btemp
      ttemp = nltk.trigrams(ad)
      all_tri += ttemp
#Lemmatize unigrams (this prevents things like cat and cats from being
#counted as different words).
      mono_stem = [lmt.lemmatize(w) for w in ad]
      all_mono += mono_stem

#Do some collocation analysis using pmi - pointwise mutual information.
#This measures how likely it is that a bigram/trigram of words actually make
#sense together.
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    trigram_measures = nltk.collocations.TrigramAssocMeasures()

    finder = BigramCollocationFinder.from_words(combined_ads)
#Only keep grams that occur in at least half of the ads; take the 100 best by PMI.
    finder.apply_freq_filter(0.5*self.n_ad)
    goodbi = finder.nbest(bigram_measures.pmi, 100)

    finder = TrigramCollocationFinder.from_words(combined_ads)
    finder.apply_freq_filter(0.5*self.n_ad)
    goodtri = finder.nbest(trigram_measures.pmi, 100)

#Calculate frequency distributions.
    fd_mono = nltk.FreqDist(all_mono)
    fd_bi = nltk.FreqDist(all_bi)
    fd_bi = KEY_TUPLE_TO_LIST(fd_bi)
    fd_tri = nltk.FreqDist(all_tri)
    fd_tri = KEY_TUPLE_TO_LIST(fd_tri)

    goodbi = TUPLE_TO_LIST(goodbi)
    goodtri = TUPLE_TO_LIST(goodtri)

    return fd_mono, fd_bi, fd_tri, goodbi, goodtri
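The PMI-based collocation step can be exercised on its own; a hedged sketch with a toy token list (the frequency threshold here is illustrative, not the 0.5*self.n_ad value used above):

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

tokens = ("machine learning engineer machine learning scientist "
          "deep learning engineer data scientist").split()

bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(2)                   # keep bigrams seen at least twice
print(finder.nbest(bigram_measures.pmi, 5))   # e.g. [('machine', 'learning'), ('learning', 'engineer')]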
Example no. 3
  def calc_all_fd(self):
    """
    This function creates a mega-frequency distribution dictionary. It takes
    a list of frequency distributions (from each ad) for each job and puts all
    of this info into a single dictionary keyed to job name.
    Result:
      self.fd_all - the mega-dictionary described above.
    """
    fd_all = []
    lmt = wnl()
    for ad in self.ads:
      btemp = nltk.bigrams(ad)
      fd_btemp = nltk.FreqDist(btemp)
      fd_btemp = KEY_TUPLE_TO_LIST(fd_btemp)

      ttemp = nltk.trigrams(ad)
      fd_ttemp = nltk.FreqDist(ttemp)
      fd_ttemp = KEY_TUPLE_TO_LIST(fd_ttemp)

      mono_stem = [lmt.lemmatize(w) for w in ad]
      fd_mtemp = nltk.FreqDist(mono_stem)

      fd_all.append( dict(list(fd_mtemp.items()) + list(fd_btemp.items()) + list(fd_ttemp.items())) )

    self.fd_all = fd_all
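A toy sketch of what one entry of fd_all holds, assuming KEY_TUPLE_TO_LIST joins each tuple key into a space-separated string; under Python 3 the three distributions can also be merged with dict unpacking:

import nltk
from nltk.stem import WordNetLemmatizer

ad = "python developers writing python scripts".split()
lmt = WordNetLemmatizer()

fd_m = nltk.FreqDist(lmt.lemmatize(w) for w in ad)
fd_b = nltk.FreqDist(' '.join(b) for b in nltk.bigrams(ad))
fd_t = nltk.FreqDist(' '.join(t) for t in nltk.trigrams(ad))

merged = {**fd_m, **fd_b, **fd_t}   # Python 3 equivalent of dict(a.items() + b.items() + c.items())
print(merged['python'], merged['python developers'])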
Example no. 4
def clean(comment, dtm=False, lemmatize=False, stop_words=False):

    no_http = re.sub(r'''https?://[\w/._-]+''', '', comment)
    no_at = re.sub(r'@\w+', '', no_http)
    no_punc = re.sub(f'[{string.punctuation}]', '', no_at)
    no_nums = re.sub(r'[0-9]+', '', no_punc)
    cleaned = nltk.word_tokenize(no_nums)

    if lemmatize:
        lemma = wnl()
        cleaned = [lemma.lemmatize(i) for i in cleaned]

    if stop_words:

        #lang = language(comment)
        stop_words = set(stopwords.words('english'))
        cleaned = [i for i in cleaned if i not in stop_words and len(i) > 1]

    if dtm:

        cv = cvec()
        cv_df = cv.fit_transform(cleaned)
        cleaned = pd.DataFrame(cv_df.toarray(), columns=cv.get_feature_names())

    return cleaned
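A hedged usage sketch for clean() as written; the exact tokens depend on the installed NLTK data, the function never lowercases, and on scikit-learn 1.2+ the dtm branch would need get_feature_names_out() in place of get_feature_names():

comment = "Check https://example.com @user 3 cats running!!"
print(clean(comment, lemmatize=True, stop_words=True))
# -> ['Check', 'cat', 'running']  ('running' is unchanged because lemmatize() defaults to the noun POS)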
Example no. 5
    def lemmatize_nouns_in_tweet(self, tweet):
        #print('Nouns lemmatization...', end=' ')
        #timestamp1 = time.time()

        l = ' '.join([wnl().lemmatize(t) for t in tweet.split()])

        #timestamp2 = time.time()
        #print('{0:.2f} seconds'.format(timestamp2 - timestamp1))

        return l
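The comprehension above constructs a new WordNetLemmatizer for every token; a functionally equivalent sketch that reuses one shared instance (lemmatize() defaults to the noun POS either way):

from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()

def lemmatize_nouns_in_tweet(tweet):
    # Reuse a single lemmatizer instead of building wnl() once per token.
    return ' '.join(_lemmatizer.lemmatize(t) for t in tweet.split())

print(lemmatize_nouns_in_tweet("two dogs chasing cars"))  # two dog chasing car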
Example no. 6
 def lemmatize(self, token_list):
     lemmatizer = wnl()
     lemmed_tokens = []
     tagged = nltk.pos_tag(token_list)
     for word, pos_tag in tagged:
         word = ''.join(
             [letter for letter in word if letter in self.alphabet])
         lem_word = lemmatizer.lemmatize(word,
                                         pos=self._get_wordnet_pos(pos_tag))
         lemmed_tokens.append(lem_word)
     return lemmed_tokens
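The helper self._get_wordnet_pos is not shown in this example; a common Penn-Treebank-to-WordNet mapping looks like the sketch below (an assumption about the helper, not code from the original class):

from nltk.corpus import wordnet

def _get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank tag prefix to the matching WordNet POS constant.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # sensible default for nouns and everything else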
Example no. 7
def gen_text_score(sentence_a, sentence_b):
    print("Generating similarity score")
    jaccard_coefficient = 0.0

    lemma_for_senta = [
        wnl().lemmatize(word.lower().strip(), tag)
        for word, tag in tags_for_sent(sentence_a)
        if check_for_tags_and_stopwords(word, tag)
    ]
    lemma_for_sentb = [
        wnl().lemmatize(word.lower().strip(), tag)
        for word, tag in tags_for_sent(sentence_b)
        if check_for_tags_and_stopwords(word, tag)
    ]

    jaccard_coefficient = len(
        set(lemma_for_senta).intersection(lemma_for_sentb)) / float(
            len(set(lemma_for_senta).union(lemma_for_sentb)))

    return jaccard_coefficient
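The score is the Jaccard coefficient |A ∩ B| / |A ∪ B| over the two lemma sets; a minimal standalone check (tags_for_sent and check_for_tags_and_stopwords are project helpers not shown here, so plain sets stand in for the lemma lists):

lemma_a = {'cat', 'chase', 'mouse'}
lemma_b = {'dog', 'chase', 'cat'}
jaccard = len(lemma_a & lemma_b) / len(lemma_a | lemma_b)
print(jaccard)  # 2 shared lemmas out of 4 distinct -> 0.5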
Example no. 8
 def __ngramwords(self, txtdata):
     stop_words = set(stopwords.words('english'))  # set stop words type
     # temp words list
     temp_wordlist = []
     # analysis words with nltk
     senlist = nltk.sent_tokenize(txtdata, language="english")
     wordlist = []
     for sent in senlist:
         wordlist.append(nltk.word_tokenize(sent, language="english"))
     # add a POS tag for each word
     tags = []
     for tokens in wordlist:
         tags.append(nltk.pos_tag(tokens, lang='eng'))
     # filter words by vocabulary type
     for sent in tags:
         for words in sent:
             # filter stop words
             if words[0] in stop_words:
                 continue
             # print(words[0] + ":" + words[1])
             if words[1][:2] in self.word_type_list_In and words[1] not in self.word_type_list_Ex \
                     and str(words[0]).lower() not in self.word_list_Ex:
                 if words[1][:2] == "VB":  # change verb to parent tense
                     add_word = wnl().lemmatize(words[0], "v")
                 elif words[1][:2] == "NN":  # change noun to morphy
                     add_word = wordnet.morphy(words[0])
                 elif words[1][:3] in ("JJR",
                                       "JJS"):  # change adj to ordinal
                     add_word = wnl().lemmatize(word=words[0],
                                                pos=wordnet.ADJ)
                 elif words[1][:3] in ("RBR",
                                       "RBS"):  # change adv to ordinal
                     add_word = wnl().lemmatize(word=words[0],
                                                pos=wordnet.ADV)
                 else:
                     add_word = words[0]
                 temp_wordlist.append(add_word)
     # create n-gram words
     ngram_wordslist = self.__word_grams(temp_wordlist, 1, 3)
     return ngram_wordslist
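The final call relies on self.__word_grams(temp_wordlist, 1, 3), which is not part of this example; a hypothetical sketch of such a 1- to 3-gram builder using nltk.ngrams:

import nltk

def word_grams(words, start=1, stop=3):
    # Build space-joined n-grams for every n from start to stop inclusive.
    grams = []
    for n in range(start, stop + 1):
        grams += [' '.join(g) for g in nltk.ngrams(words, n)]
    return grams

print(word_grams(['data', 'science', 'team']))
# ['data', 'science', 'team', 'data science', 'science team', 'data science team']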
Example no. 9
 def pickNounAndLemmatize(self, sentences):
     wordList = []
     for sentence in sentences:
         temp = []
         words = sentence.split(" ")
         words = list(filter(None, words))
         tag_tuples = nltk.pos_tag(words)
         for tup in tag_tuples:
             if 'NN' in tup[1]:
                 #regex #lemmatize #stopwordCheck
                 word = re.sub('[^A-Za-z]+', '', tup[0])
                 word = word.lower()
                 lmtzr = wnl()
                 word = lmtzr.lemmatize(word)
                 if (self.not_stopword(word) and len(word) > 2):
                     temp.append(word)
         wordList.append(temp)
     return wordList
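A standalone sketch of the same noun-picking logic, assuming not_stopword simply checks the NLTK English stop-word list; it also reuses one lemmatizer instance instead of constructing wnl() per word:

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lmtzr = WordNetLemmatizer()
stop = set(stopwords.words('english'))

def pick_nouns(sentence):
    # Keep lemmatized, lowercased nouns longer than two characters that are not stop words.
    nouns = []
    for word, tag in nltk.pos_tag(sentence.split()):
        if 'NN' in tag:
            word = lmtzr.lemmatize(re.sub('[^A-Za-z]+', '', word).lower())
            if word not in stop and len(word) > 2:
                nouns.append(word)
    return nouns

print(pick_nouns("Senior engineers review pull requests."))  # e.g. ['engineer', 'request'], depending on the tagger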
Example no. 10
def process_text(message: str):
    # Create Stop words
    stop_words = (stopwords.words("english"))
    stop_words += CUSTOMIZED_STOP_WORDS

    # Remove punctuation
    text = re.sub("[^a-zA-Z]", ' ', message)

    # Convert to lowercase
    text = text.lower()

    # collapse remaining digits and non-word characters into spaces
    text = re.sub('(\\d|\\W)+', " ", text)

    # Create tokens
    text = list(set(nltk.word_tokenize(text)))
    text = [wnl().lemmatize(word) for word in text if not word in stop_words]

    for i in range(len(text)):
        if text[i] in similar_words.similar_words_dict:
            text[i] = similar_words.similar_words_dict[text[i]]

    return text
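A hedged trace of the regex cleaning steps on a sample message (CUSTOMIZED_STOP_WORDS and similar_words come from the surrounding project and are not shown; note that list(set(...)) later drops duplicate tokens and any original ordering):

import re

message = "Great product!! Visit shop 24/7, really great value."
text = re.sub("[^a-zA-Z]", ' ', message).lower()   # replace punctuation and digits with spaces
text = re.sub('(\\d|\\W)+', " ", text)             # collapse the leftover whitespace runs
print(text)  # 'great product visit shop really great value '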
Example no. 11
def process(question):
  if const.showlog==1:
    print("[wiki-buddy] Parsing user question...")
  question=question.replace("?","")
  spquestion=question.split()
  tagwords=pos_tag(spquestion)
  verbindex=[]
  caveat=""
  splitnouns=False
  rawqtype=[word for word, pos in tagwords if pos=='WDT' or pos=='WP' or pos=='WP$' or pos=='WRB']
  rawverb=[word for word, pos in tagwords if pos=='VB' or pos=='VBD' or pos=='VBG' or pos=='VBN' or pos=='VBP' or pos=='VBZ' or pos=='TO']
  for a in range(0,len(spquestion)):
    for n in range(0,len(rawqtype)):
      if rawqtype[n] in spquestion[a]:
        spquestion[a]=""
    for n in range(0,len(rawverb)):
      if rawverb[n] in spquestion[a]:  
        spquestion[a]=""
        verbindex.append(a)
  if len(verbindex)>0 and verbindex[0]>1:
    for n in range(1,verbindex[0]):
      caveat=caveat+" "+spquestion[n]
    caveat=caveat.strip()
    question=question.replace(caveat," ")
    spquestion=question.split()
    tagwords=pos_tag(spquestion)
  pindex=[word for word, pos in tagwords if pos=='NNP' or pos=='NNPS']
  if len(pindex)>0:
    rawnoun=[word for word, pos in tagwords if pos=='NN' or pos=='NNS' or pos=='NNP' or pos=='NNPS' or pos=='IN' or pos=='CC' or pos=='CD' or pos=='JJ' or pos=='JJR' or pos=='JJS']
  else:
    rawnoun=[word for word, pos in tagwords if pos=='NN' or pos=='NNS' or pos=='IN' or pos=='CC' or pos=='CD' or pos=='JJ' or pos=='JJR' or pos=='JJS']
  useless=[word for word, pos in tagwords if pos=='DT']
  for a in range(0,len(spquestion)):
    for n in range(0,len(rawnoun)):
      if rawnoun[n] in spquestion[a]:
        spquestion[a]=""
    for n in range(0,len(useless)):
      if useless[n] in spquestion[a]:
        spquestion[a]=""
  question=" ".join(spquestion)
  qtype=" ".join(rawqtype)
  for n in range(0,len(rawverb)):
    if rawverb[n] in const.omitverblist:
      rawverb[n]=""
  verb=" ".join(rawverb)
  keyword=" ".join(rawnoun)
  verb=verb.strip()
  caveat=caveat+" "+verb
  qtype=qtype.strip()
  keyword=keyword.strip()
  caveat=caveat.strip()
  qtype="["+qtype.lower()+"]"
  
  keyword=wnl().lemmatize(keyword)
  caveat=caveat.strip()

  if caveat=="" and " " in keyword:
    keyword,caveat=splitkey(keyword)

  tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
  if const.showlog==1:
    print("[wiki-buddy] Reading Wikipedia articles for keyword '"+keyword+"'...")
  display_url=""
  try:
    rawdata=wikipedia.page(title = keyword,auto_suggest = True)
    fulltext=rawdata.content
    display_url=rawdata.url
  except:
    fulltext=""
  try:
    summary=wikipedia.summary(keyword)
  except:
    summary=""
  try:
    categories=wikipedia.page(title = keyword,auto_suggest = True).categories
  except:
    categories=[]
  if fulltext!="":
    sentences=nltk.sent_tokenize(fulltext)
    for n in range(0,len(const.omitpuctlist)):
      fulltext=fulltext.replace(const.omitpuctlist[n],"")
  elif summary!="":
    sentences=nltk.sent_tokenize(summary)
    for n in range(0,len(const.omitpuctlist)):
      summary=summary.replace(const.omitpuctlist[n],"")
  else:
    qtype="[null]"
    sentences=[]
  words=fulltext.split()
  for n in range(0,len(words)):
    words[n] = wnl().lemmatize(words[n])
  if const.showlog==1:
    print("[wiki-buddy] User question was processed into the following chunks.")
    print("             Question type: "+qtype)
    print("             Keyword: "+keyword)
    print("             Caveat: "+caveat)
  return qtype,keyword,caveat,fulltext,summary,sentences,words,categories,display_url
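A small standalone sketch of the tag-based splitting that process() performs on the question text (const, splitkey and the Wikipedia lookups are project-specific and omitted):

from nltk import pos_tag

words = "What is the capital of France".split()
tags = pos_tag(words)
qtype = [w for w, p in tags if p in ('WDT', 'WP', 'WP$', 'WRB')]
verbs = [w for w, p in tags if p.startswith('VB') or p == 'TO']
nouns = [w for w, p in tags if p.startswith('NN') or p in ('IN', 'CC', 'CD', 'JJ', 'JJR', 'JJS')]
print(qtype, verbs, nouns)  # e.g. ['What'] ['is'] ['capital', 'of', 'France']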
Example no. 12
def nltk_stemming(word):
    lmtzr = wnl()
    return lmtzr.lemmatize(word)
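Despite its name this helper lemmatizes rather than stems, and lemmatize() defaults to the noun POS, so verbs pass through unchanged unless a POS is supplied:

from nltk.stem import WordNetLemmatizer

lmtzr = WordNetLemmatizer()
print(lmtzr.lemmatize('cars'))          # car
print(lmtzr.lemmatize('running'))       # running (treated as a noun)
print(lmtzr.lemmatize('running', 'v'))  # run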
Example no. 13
    def remove_junk(tm, tb, tt, gb, gt):
      """
      This code cleans the keyword list (i.e. removes similar words, etc.)
      Input:
        tm - dictionary of unigram relevance scores.
        tb - dictionary of bigram relevance scores.
        tt - dictionary of trigram relevance scores.
        gb - bigrams with high pmi scores.
        gt - trigrams with high pmi scores.
      Output:
        tm, tb and tt cleaned.
      """
#Filter out all of the bigrams/trigrams with low pmi scores (i.e. only keep the
#ones in gb and gt).
      tempb = {}
      for term in gb: tempb[term] = tb.get(term,0)
      tb = tempb

      tempt = {}
      for term in gt: tempt[term] = tt.get(term,0)
      tt = tempt

#Get rid of terms that contain the job name. 
      job_name = self.name
      if(' ' in str(job_name)):
        sjn = job_name.split()
        for w in sjn:
          tm.pop(w,None)
        if(len(sjn)==2):
          tb.pop(job_name,None)
        elif(len(sjn)==3):
          tt.pop(job_name,None)
      else:
        tm.pop(job_name,None)
    
#Set the stopwords + other words that seem to come up often but obviously make
#no sense.
      sw = stopwords.words('english')+\
           ['yes', 'no', 'your', 'youll', 'benefits', 'go', 'river', 'amp',\
            'us', 'e', 'permit','requires','work','types', 'dot', 'without',\
            'plus', 'must', 'way', 'new', 'job', 'click', 'http', 'winning',\
            '/', 'intended', 'youre', 'location', 'conditions', 'sized',\
            'use', 'may', 'june', 'year', 'o', 'g', 'n', 'take', 'right',\
            'term', 'always', 'existing', 'onto', 'youve', 'experience',\
            'really', 'ensure', 'difference', 'ensures', 'v', 'years', 'onto']
      monopop = []
      bipop = []
      tripop = []
  
#Remove stop words (or terms containing stop words).
      for key in tm.keys():
        if(key in sw):
          monopop.append(key)
      for key in tb.keys():
        k = key.split()
        for w in k:
          if(w in sw):
            bipop.append(key)
            break
      for key in tt.keys():
        k = key.split()
        for w in k:
          if(w in sw):
            tripop.append(key)
            break
      for p in set(monopop): tm.pop(p,None)
      for p in set(bipop): tb.pop(p,None)
      for p in set(tripop): tt.pop(p,None)

#Take care of trigrams with overlapping bigrams (e.g. "banker residential
#brokerage" and "residential brokerage company"): remove the one with the
#lower relevance score (or, if they are equal, just keep one).
      monopop = []
      bipop = []
      tripop = []

      maxr = {}
      for term, rel in tt.items():
        k = term.split()
        c1 = k[0]+' '+k[1]
        c2 = k[1]+' '+k[2]
        if(maxr.get(c1,(0,''))[0] <= rel): maxr[c1] = (rel,term)
        if(maxr.get(c2,(0,''))[0] <= rel): maxr[c2] = (rel,term)
      for term, rel in tt.items():
        k = term.split()
        c1 = k[0]+' '+k[1]
        c2 = k[1]+' '+k[2]
        if(maxr.get(c1,(0,''))[1] != term and maxr.get(c2,(0,''))[1] != term):
          tripop.append(term)

#Do some lemmatizing on the units making up bigrams and trigrams to get rid of
#similar words or bigrams contained in trigrams (unigrams contained in bigrams).
      lmt = wnl()
      for term, rel in tt.items():
        k = term.split()
        c1 = k[0]+' '+k[1]
        c2 = k[1]+' '+k[2]
        if(math.fabs(rel-tb.get(c1,1e5)) < 140. and rel > 70.): bipop.append(c1)
        if(math.fabs(rel-tb.get(c2,1e5)) < 140. and rel > 70.): bipop.append(c2)
        if(c1 == job_name or c2 == job_name): tripop.append(term)
        kl = [lmt.lemmatize(w) for w in k]
        if( (math.fabs(rel-tm.get(kl[0],rel))<140. or \
             math.fabs(rel-tm.get(kl[1],rel))<140. or \
             math.fabs(rel-tm.get(kl[2],rel))<140.) and rel > 34. ):
          monopop += kl
  
      for term, rel in tb.items():
        k = term.split()
        kl = [lmt.lemmatize(w) for w in k]
        if( (math.fabs(rel-tm.get(kl[0],rel))<70. or \
             math.fabs(rel-tm.get(kl[1],rel))<70.) and rel > 17. ):
          monopop += kl
    
#Do some stemming on unigrams to get rid of similar words.
      stemmer = nltk.PorterStemmer()
      for t1, r1 in tm.items():
        if(r1 < 17.): continue
        st1 = stemmer.stem(t1)
        for t2, r2 in tm.items():
          if(t1 == t2 or r2 < 17.): continue
          st2 = stemmer.stem(t2)
  
          if(r1 < r2): small = t1
          else: small = t2
          if( (st1 == st2) or (st1 == st2+"e") or (st1+"e" == st2) or \
              (st1[:-1] == st2) or (st1 == st2[:-1]) or \
              (st1[:-1] == st2[:-1]) ):
            monopop.append(small)
  
      for p in set(monopop): tm.pop(p,None)
      for p in set(bipop): tb.pop(p,None)
      for p in set(tripop): tt.pop(p,None)

      return tm, tb, tt
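A standalone sketch of the stem-based near-duplicate test used in the final loop (two unigrams count as similar when their Porter stems match exactly or differ only by a trailing letter or 'e'):

import nltk

stemmer = nltk.PorterStemmer()

def similar(t1, t2):
    # True when two unigrams reduce to (almost) the same Porter stem.
    s1, s2 = stemmer.stem(t1), stemmer.stem(t2)
    return (s1 == s2 or s1 == s2 + "e" or s1 + "e" == s2
            or s1[:-1] == s2 or s1 == s2[:-1] or s1[:-1] == s2[:-1])

print(similar('manager', 'managers'), similar('bank', 'broker'))  # True False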