Example #1
def make_lookup():
    tags = []

    for doc in tagColl.find():
        for c in doc["concepts"]:
            tags.append(c["text"])
            
    tags = [t.lower() for t in list(set(tags))]

    lookup = {}

    for t in tags:
        lookup[t.lower()] = []
        lookup[stem(t.lower(), stemmer=PORTER)] = []
        
    for doc in tagColl.find():
        for c in doc["concepts"]:
            lookup[c["text"].lower()].append([str(doc["_id"]), c["relevance"]])
            lookup[stem(c["text"].lower(), stemmer=PORTER)].append([str(doc["_id"]), c["relevance"]])
            
    for k in lookup.keys():
        lookup[k] = sorted(lookup[k], key=lambda x: x[1])
        lookup[k] = [x[0] for x in lookup[k]]

    with open('hst_lookup.json', 'w') as outfile:
        json.dump(lookup, outfile)

    return lookup
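
The snippet above assumes module-level imports and a MongoDB collection of tagged documents; a minimal sketch of that setup (database and collection names are illustrative, not taken from the original project):

import json
from pymongo import MongoClient          # MongoDB driver (assumed)
from pattern.vector import stem, PORTER  # pattern's stemming helpers

client = MongoClient()                    # local MongoDB instance (assumption)
tagColl = client["concepts_db"]["tags"]   # documents carry "_id" and a "concepts" list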
Example #2
def roots_and_lemmas():

    print(stem('cars', PORTER))  #Root
    print(stem('cars', LEMMA))
    print(stem('studies', PORTER))  # Root
    print(stem('studies', LEMMA))

    text = "People who teach find teaching very rewarding."
    tokens = words(text)
    print(count(tokens, stopwords=True, stemmer=PORTER))
    print(count(tokens, stopwords=True, stemmer=LEMMA))
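
The imports the example above appears to rely on all live in pattern.vector; the outputs noted below are approximations and may vary by pattern version:

from pattern.vector import stem, count, words, PORTER, LEMMA

# Approximate behaviour (assumption, not verified output):
# stem('studies', PORTER) -> 'studi'  (a root, not necessarily a real word)
# stem('studies', LEMMA)  -> 'study'  (the dictionary form)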
Example #3
def features(message):
    singlegrams = [i for i in message.split() if i not in stop]  # Removing stop words (module-level 'stop' list)
    
    singlegramsrefined = []
    #Stemming the single words
    for k in singlegrams:
        r = stem(k, stemmer=LEMMA)
        singlegramsrefined.append(r)
    newmessage = " ".join(singlegramsrefined) 
    newmessage = re.sub("[^A-Za-z]", " ", newmessage)# Removing numbers
    newmessage = re.sub(r'[^\w]', ' ', newmessage)  # Removing non-word characters
    singlegrams= [i for i in newmessage.split()]

    singlegramsrefined2 = []
    
    for word in singlegrams:
        singlegramsrefined2.append(word)
        
    bigrams = ngrams(newmessage, n=2)#bigrams
    trigrams = ngrams(newmessage, n=3)#trigrams
    
    totalgrams = singlegramsrefined2 + bigrams + trigrams
    
    totalgrams = tuple(totalgrams)#tuple having single words, bigrams and trigrams
    return totalgrams
Example #4
def features(message):
    #List of nltk stopwords
    stop = [u'i','diabetes','diabetic','type 2 diabetes','type 2', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves',
            u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers',
            u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who',
            u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have',
            u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or',
            u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between',
            u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in',
            u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where',
            u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some',
            u'such', u'no', u'nor', u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can',
            u'will', u'just', u'don', u'should', u'now','m']
    singlegrams = [i for i in message.split() if i not in stop]  # Removing stop words
    
    singlegramsrefined = []
    #Stemming the single words
    for k in singlegrams:
        r = stem(k, stemmer=LEMMA)
        if r not in stop:
            singlegramsrefined.append(r)
    newmessage = " ".join(singlegramsrefined) 
    newmessage = re.sub("[^A-Za-z]", " ", newmessage)# Removing numbers
    newmessage = re.sub(r'[^\w]', ' ', newmessage)# Removing non alphanumerics
    singlegrams= [i for i in newmessage.split() if len(i) > 1]

    singlegramsrefined2 = []
    
    for word in singlegrams:
        singlegramsrefined2.append(word)
        
    bigrams = ngrams(newmessage, n=2)#bigrams
    trigrams = ngrams(newmessage, n=3)#trigrams
    v = parsetree(newmessage, lemmata=True)[0]
    v = [w.lemma for w in v if w.tag.startswith(('NN'))]
    singlewords = []
    for i in v:
        stopping = stop +[u'hour',u'husband',u'anything',u'thing',u'way',u'n',u'number',u'person',u'd',u'x',u'dose',u'drug',u'today',u'help',u'everyone',u'bed',u'mine',u'bed',u'issue',u'anyone',u'thank' ,u'test', u'eat',u'something',u'doc',u'time',u'c',u'luck',u'lb',u'dr',u'morning','t',u'pill',u'upset',u'take',u'couple',u'month',u'use',u'exercise',u'diet',u'lot',u'vision','taking',u've',u'time',u'month',u'level',u'body',u'diet',u'food',u'release', u'time', u'meal',u'glipizide',u'week',
                          'type','yr',u'symptom',u'cause',u'tablet',u'blood',u'feel',u'like',
                          u'made',u'bad',u'work',u'still',
                          u'got',u'twice',u'i',u'mg',u'm',u'day',
                          u'sugar',u'taking',u'doctor',u'get',u'year',
                          u'side',u'went',u'med',u'one',u'better',
                          u'effect',u'problyear',u'side',u'went',u'med',u'one',u'better',u'effect',u'problem',u'also']
        if i not in stopping:
            singlewords.append(i)
    bi = []
    for r in bigrams:
        if r not in [(u'year', u'now'),(u'also', u'take'),(u'doesn', u't') ,(u'take', u'food'),(u'taking', u'metformin'),(u'i', u'diagnosed'),(u'metformin', u'mg'),(u'empty', u'stomach'),(u'couldn', u't'),(u'blood', u'sugar'),(u'diet', u'exercise'),(u'mg', u'x'),(u'type', u'diabetes'),(u'side', u'effect'),(u'i', u'm'),(u'i', u've'),(u'twice', u'day'),
                     (u'a', u'c'),(u'don', u't'),(u'slow', u'release'),(u't', u'take'),(u't', u'take'),
                     (u'good', u'luck'),(u'didn', u't'),(u'mg', u'twice'),(u'take', u'metformin'),(u'time', u'day'),
                     (u'went', u'away'),(u'year', u'ago'),(u'much', u'better'),(u'extended', u'release'),(u'started', u'taking'),
                     (u'can', u't'),(u'anyone', u'else'),(u'month', u'ago'),(u'mg', u'day')]:
            bi.append(r)      
   
    
    totalgrams = singlewords + bi
    
    
    return totalgrams
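
A hypothetical call, assuming module-level imports such as from pattern.vector import stem, LEMMA; from pattern.en import ngrams, parsetree; and import re (the sample message is made up):

sample = "Started taking metformin twice a day and my blood sugar finally dropped"
print(features(sample))   # filtered noun lemmas plus bigrams not on the exclusion list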
Example #5
def stem_words(data):
    """
    Stem words to their base
    linguistic stem to remove redundancy
    """
    return [stem(val, stemmer=PORTER) for val in data]
Example #6
	def __iter__(self):
		for line in open(os.path.join(__location__, 'KeyVisCorpora', 'abstracts.txt'), 'rU'):
			line = unicode(line, errors='ignore')
			lowers = line.lower()
			tokenList = lowers.split()
			output = [stem(word, stemmer=LEMMA) for word in tokenList]
			#Assume there's one document per line, tokens separated by space
			yield dictionary.doc2bow([x.strip() for x in output])
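
The iterator above follows gensim's streaming-corpus pattern; a sketch of how such a class might be consumed (the class name is an assumption, and 'dictionary' is the gensim Dictionary built elsewhere in the project, see Examples #25 and #28):

from gensim import models

corpus = AbstractsCorpus()   # hypothetical name of the class that defines __iter__ above
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=20)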
Example #7
 def __iter__(self):
     for line in open(
             os.path.join(__location__, 'KeyVisCorpora', 'abstracts.txt'),
             'rU'):
         line = unicode(line, errors='ignore')
         lowers = line.lower()
         tokenList = lowers.split()
         output = [stem(word, stemmer=LEMMA) for word in tokenList]
         #Assume there's one document per line, tokens separated by space
         yield dictionary.doc2bow([x.strip() for x in output])
Example #8
def stem_words(words):
    """Stem words to their base linguistic stem to remove redundancy.

    Args:
        words (list): The list of words

    Returns:
        list: An updated word list with words stemmed.
    """
    return [stem(word, stemmer=PORTER) for word in words]
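
A quick usage sketch, assuming from pattern.vector import stem, PORTER at module level:

print(stem_words(["studies", "cars", "spies"]))
# roughly ['studi', 'car', 'spi'] -- Porter roots; exact strings depend on the library version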
Example #9
def stem_words(words):
    """Stem words to their base linguistic stem to remove redundancy.

    Args:
        words (list): The list of words

    Returns:
        list: An updated word list with words stemmed.
    """
    return [stem(word, stemmer=PORTER) for word in words]
Example #10
def featureExtractor(textMessage,countgrams):
    textMessage = textMessage.lower()
    #Function to remove stop words
    stopWords = [u'i','m', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself',
                 u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its',
                 u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom',
                 u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have',
                 u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or',
                 u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between',
                 u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on',
                 u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how',
                 u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only',
                 u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now']
   
    
    avoidList1 = ['diabetes','type 2','diabetic']
    avoidList = stopWords + avoidList1
    #Removing these stop words and general cleaning
    singleGrams =  [i for i in textMessage.split() if i not in avoidList]
    singlegramsRefined = []

    #Stemming the words for normalization
    for k in singleGrams:
        r = stem(k, stemmer=LEMMA)
        singlegramsRefined.append(r)
    newMessage = " ".join(singlegramsRefined) 
    newMessage = re.sub("[^A-Za-z]", " ", newMessage)# Removing numbers
    newMessage = re.sub(r'[^\w]', ' ', newMessage)# Removing all non alphanumeric chars
    singleGrams= [i for i in newMessage.split()] #Again splitting to single grams


    singlegramsRefined2 = [word for word in singleGrams] #Keep this now because it works
    biGrams = ngrams(newMessage, n=2)# Generating bigrams
    triGrams = ngrams(newMessage, n=3)#Generating trigrams

    totalGramsrefined = []
    if countgrams == 1:
        
        totalGrams = singlegramsRefined2
        
        totalGramsrefined = [i for i in totalGrams]  # no model-feature filtering in this variant

    elif countgrams == 2:
        totalGrams = singlegramsRefined2+biGrams
        
        totalGramsrefined = [i for i in totalGrams]

    elif countgrams == 3:
        totalGrams = singlegramsRefined2+biGrams + triGrams
        
        totalGramsrefined = [i for i in totalGrams]
        

    return totalGramsrefined
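
Hypothetical call, assuming the same pattern imports as in the earlier examples (stem and LEMMA from pattern.vector, ngrams from pattern.en) plus import re:

msg = "Started a new diet and exercise plan two months ago"
print(featureExtractor(msg, countgrams=2))   # unigrams plus bigrams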
Example #11
    def _validate(self, clue: str, positiveWords: np.array,
                  negativeWords: np.array) -> bool:
        clue = clue.lower()

        invalidWords: np.array = np.append(
            self.previousClues, np.append(positiveWords, negativeWords))
        stemmedClue: str = stem(clue)
        singularClue: str = singularize(clue)
        pluralClue: str = pluralize(clue)

        if not clue.isalpha() or not clue.isascii() or set(
                "aeiouy").isdisjoint(clue) or not 2 <= len(clue) <= 12:
            return False

        for word in invalidWords:
            stemmedWord = stem(word)
            singularWord = singularize(word)
            pluralWord = pluralize(word)
            if clue in word or word in clue or stemmedClue in word or stemmedWord in clue or \
                    singularClue in word or singularWord in clue or pluralClue in word or pluralWord in clue:
                return False

        return True
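
Imports this validator appears to rely on (an assumption based on the names it uses):

import numpy as np
from pattern.vector import stem
from pattern.en import singularize, pluralize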
Example #12
 def visit(self, link, source=None):
     print('visited:', repr(link.url), 'from:', link.referrer)
     i = str(link).split('/')
     i = [stem(i[j]) for j in range(len(i))]
     i = '_'.join(str(e) for e in i)
     b = search('5g', i)
     Data = {}
     if not len(b) == 0:
         hash_object = hashlib.sha256(link.url)
         hex_dig = hash_object.hexdigest()
         Data['id'] = hex_dig
         Data['link'] = repr(link.url)
         #Data['source'] = FROM
         r = json.dumps(Data)
         loaded_r = json.loads(r)
         es.index(index='mining_links',
                  doc_type='mining_links',
                  id=i,
                  body=loaded_r)
         Data = {}
Example #13
 def make_additional_keywords( self ):
   '''unstemmed words not in stemmed list'''
   assert type(self.keywords_stemmed) == list
   if len( self.keywords_stemmed ) > 0:
     assert type(self.keywords_stemmed[0]) == tuple
   assert type(self.keywords_unstemmed) == list
   if len( self.keywords_unstemmed ) > 0:
     assert type(self.keywords_unstemmed[0]) == tuple
   ## make simple stemmed keyword list from (score, word) tuple
   temp_simple_stemmed = []
   for kw_tuple in self.keywords_stemmed:
     score = kw_tuple[0]; word = kw_tuple[1]
     temp_simple_stemmed.append( word )
   ## add any additional unstemmed keywords (whose stems aren't in temp_simple_stemmed )
   self.keywords_unstemmed_additional = []
   for kw_tuple in self.keywords_unstemmed:
     score = kw_tuple[0]; word = kw_tuple[1]
     if word not in temp_simple_stemmed:  # TODO: time using sets here instead
       if stem( word, stemmer=PORTER ) not in temp_simple_stemmed:
         self.keywords_unstemmed_additional.append( kw_tuple )
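
The method above expects lists of (score, word) tuples; an illustrative shape for the two inputs (values made up):

keywords_stemmed   = [(0.91, u'studi'), (0.55, u'teach')]        # already stemmed
keywords_unstemmed = [(0.90, u'studies'), (0.40, u'classroom')]  # raw keywords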
Example #14
    def handle_starttag(self, tag, attrs):
        #print(lien)

        for attr in attrs:

            a = attr[1]
            a = a.split('/')
            a = [stem(a[j]) for j in range(len(a))]
            #print(a)
            a = '_'.join(str(e) for e in a)

            for i in attr:

                l = search('src', i)
                b = search('path', i)
                b1 = search('5g', a)
                b2 = search('mob', a)
                b3 = search('imag', a)
                b4 = search('video', a)
                b5 = search('pdf', a)

                if not len(b1) == 0:

                    if not len(b5) == 0:

                        if not attr[1][2:] in Pdfs[lien]:
                            Pdfs[lien].append(attr[1][2:])

                    if not len(b3) == 0:
                        if not attr[1][2:] in Images[lien]:
                            Images[lien].append(attr[1][2:])

                    if not len(b4) == 0:
                        if not attr[1][2:] in Videos[lien]:
                            Videos[lien].append(attr[1][2:])

                if not len(l) == 0 and not len(b1) == 0:
                    if not attr[1][2:] in Images[lien]:
                        Images[lien].append(attr[1][2:])
Example #15
def process_articles(articles, stoplist):
    print "Cleaning Articles: Special Characters, Stemming, Stopwords"

    remove_list = string.ascii_letters + string.digits

    cleanArticles = []

    for a in articles:
        # html entities
        a = gensim.utils.decode_htmlentities(a)

        # Remove Unicode
        temp = a.decode("utf-8")
        temp = temp.encode("ascii", errors="ignore")

        # Split
        temp = temp.split()
        cleanArticle = []
        for w in temp:
            # Lowercase
            w = w.lower()

            if w in stoplist:
                continue

            # Remove Special Chars
            w = "".join([l for l in w if l in remove_list])

            if w != "":
                w = stem(w, stemmer=LEMMA)
                cleanArticle.append(w)

        cleanArticles.append(cleanArticle)

    print "Cleaned Articles"

    return cleanArticles
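
A hypothetical call in the same Python 2 style as the snippet, assuming the stoplist comes from NLTK (raw_articles is a made-up name for a list of byte strings):

from nltk.corpus import stopwords   # assumed source of the stop word list

stoplist = set(stopwords.words('english'))
clean_articles = process_articles(raw_articles, stoplist)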
Example #16
 def test_stem(self):
     # Assert stem with PORTER, LEMMA and pattern.en.Word.
     s = "WOLVES"
     v1 = vector.stem(s, stemmer=None)
     v2 = vector.stem(s, stemmer=vector.PORTER)
     v3 = vector.stem(s, stemmer=vector.LEMMA)
     v4 = vector.stem(s, stemmer=lambda w: "wolf*")
     v5 = vector.stem(Word(None, s, lemma=u"wolf*"), stemmer=vector.LEMMA)
     v6 = vector.stem(Word(None, s, type="NNS"), stemmer=vector.LEMMA)
     self.assertEqual(v1, "wolves")
     self.assertEqual(v2, "wolv")
     self.assertEqual(v3, "wolf")
     self.assertEqual(v4, "wolf*")
     self.assertEqual(v5, "wolf*")
     self.assertEqual(v6, "wolf")
     # Assert unicode output.
     self.assertTrue(isinstance(v1, unicode))
     self.assertTrue(isinstance(v2, unicode))
     self.assertTrue(isinstance(v3, unicode))
     self.assertTrue(isinstance(v4, unicode))
     self.assertTrue(isinstance(v5, unicode))
     self.assertTrue(isinstance(v6, unicode))
     print("pattern.vector.stem()")
Example #17
 def test_stem(self):
     # Assert stem with PORTER, LEMMA and pattern.en.Word.
     s = "WOLVES"
     v1 = vector.stem(s, stemmer=None)
     v2 = vector.stem(s, stemmer=vector.PORTER)
     v3 = vector.stem(s, stemmer=vector.LEMMA)
     v4 = vector.stem(s, stemmer=lambda w: "wolf*")
     v5 = vector.stem(Word(None, s, lemma=u"wolf*"), stemmer=vector.LEMMA)
     v6 = vector.stem(Word(None, s, type="NNS"), stemmer=vector.LEMMA)
     self.assertEqual(v1, "wolves")
     self.assertEqual(v2, "wolv")
     self.assertEqual(v3, "wolf")
     self.assertEqual(v4, "wolf*")
     self.assertEqual(v5, "wolf*")
     self.assertEqual(v6, "wolf")
     # Assert unicode output.
     self.assertTrue(isinstance(v1, unicode))
     self.assertTrue(isinstance(v2, unicode))
     self.assertTrue(isinstance(v3, unicode))
     self.assertTrue(isinstance(v4, unicode))
     self.assertTrue(isinstance(v5, unicode))
     self.assertTrue(isinstance(v6, unicode))
     print "pattern.vector.stem()"
Example #18
    def parse_message(self, text, usernick, channel):
        if channel != self.chan:
            userOrFalse = usernick
        else:
            userOrFalse = False

        words = re.findall(r"\b[\w]+\b", text.lower())
        tokens = text.lower().split()

        original_words = words[:]

        try:
            words.remove(self.nick)
        except:
            pass

        try:
            words.remove('hst')
        except:
            pass

        try:
            tree = parsetree(' '.join(words))
            firstNoun = match('NN|NNS|NNP|NNPS', tree)
        except:
            firstNoun = None

        # print original_words
        if self.nick in original_words:

            if set(words) & set(['help', 'commands']):
                commandsTemp = Template(self.commands)
                self.send_msg(
                    commandsTemp.substitute(usernick=usernick, botnick=self.nick),
                    channel=userOrFalse
                )
            elif '?' in text or (set(words) & set(['who', 'where', 'when', 'what', 'why', 'how'])):
                fileObj = open('weird_grammar.json', 'r')
                jsonObj = json.load(fileObj)
                fileObj.close()
                s = sentiment(text)[0]
                if s > 0:
                    print s * 2500 + 1
                    self.send_msg(
                        make_polar(jsonObj, int(s * 2500 + 1)),
                        channel=userOrFalse
                    )
                else:
                    print s * 2500 - 1
                    self.send_msg(
                        make_polar(jsonObj, int(s * -2500 - 1), sent=0),
                        channel=userOrFalse
                    )

            elif firstNoun is not None:
                print firstNoun.string.replace('_', ' ')
                s = sentiment(text)[0]
                sentences = sorted(
                    pf_sentences(abs(s*1000+3), firstNoun.string.replace('_', ' ')),
                    key = lambda x: sentiment(x)[0]
                )

                if s > 0:
                    # print s * 2500 + 1
                    self.send_msg(
                        ' '.join(sentences[-3:]),
                        channel=userOrFalse
                    )
                else:
                    # print s * 2500 - 1
                    self.send_msg(
                        ' '.join(sentences[:3]),
                        channel=userOrFalse
                    )
            else:
                snarkTemp = Template(rc(self.snarklist))
                self.send_msg(
                    snarkTemp.substitute(usernick=usernick, botnick=self.nick),
                    channel=userOrFalse
                )

        if tokens[0] == '.seen':
            tgt_user = tokens[1]
            if tgt_user in self.seen_dict:
                last_time, last_msg = self.seen_dict[tgt_user]
                self.send_msg(
                    "%s: %s last seen on %s saying: %s" % (usernick, tgt_user, last_time, last_msg),
                    channel=userOrFalse
                )
            else:
                self.send_msg(
                    "%s: I haven't seen %s." % (usernick, tgt_user),
                    channel=userOrFalse
                )

        elif tokens[0] == '.tell':
            tgt_user = tokens[1]
            if not tgt_user in self.tells_dict:
                self.tells_dict[tgt_user] = []
            self.tells_dict[tgt_user].append((usernick, ' '.join(tokens[2:])))
            self.send_msg(
                "%s: Ok, I'll tell %s that for you." % (usernick, tgt_user),
                channel=userOrFalse
            )
            with open('tells_dict.json', 'w') as outfile:
                json.dump(self.tells_dict, outfile)

        elif tokens[0] == '.showtells':
            if not usernick in self.tells_dict or not self.tells_dict[usernick]:
                self.send_msg("%s: I have nothing for you." % usernick, channel=usernick)
            else:
                while self.tells_dict[usernick]:
                    src_user, tell_msg = self.tells_dict[usernick].pop()
                    self.send_msg("%s said: %s" % (src_user, tell_msg), channel=usernick)
            with open('tells_dict.json', 'w') as outfile:
                json.dump(self.tells_dict, outfile)

        elif tokens[0] == '.gif':
            gif_url = get_gif(tokens[1:])
            self.send_msg("%s: %s" % (usernick, gif_url), channel=userOrFalse)

        elif tokens[0] == '.wiki':
            try:
                wiki_url, wiki_text = get_wiki_article(tokens[1:])
            except:
                self.send_msg(
                    "%s: I'm sorry, but something went wrong!" % usernick,
                    channel=userOrFalse
                )
            else:
                if wiki_text:
                    safe_wiki_text = ''.join(list(wiki_text)[:300]).replace('\n', ' ') + '...'
                    safe_wiki_text = safe_wiki_text.encode('ascii', 'ignore')
                    self.send_msg(
                        "%s: %s | %s" % (usernick, wiki_url, safe_wiki_text),
                        channel=userOrFalse
                    )
                else:
                    self.send_msg(
                        "%s: I'm sorry, but something went wrong!" % usernick,
                        channel=userOrFalse
                    )
                    
        elif tokens[0] == '.yt':
            try:
                result = youtube_search(tokens[1:])
                result = map(lambda x: x.encode('ascii', 'ignore'), result)
                title, desc, vidId = result
                self.send_msg(
                    "%s: %s | %s | https://www.youtube.com/watch?v=%s" % (usernick, title, desc, vidId),
                    channel=userOrFalse
                )
            except:
                self.send_msg(
                    "%s: I'm sorry, but something went wrong!" % usernick,
                    channel=userOrFalse
                )

        elif tokens[0] == '.hst':
            # self.send_msg('/nick drgonzo')
            if firstNoun is not None:
                lookupFile = open("hst_lookup.json", 'r')
                lookup = json.load( lookupFile )
                lookupFile.close()

                nounStem = stem(firstNoun, stemmer=PORTER)
                idHash = None

                print nounStem

                try:
                    # switch to descending
                    idHash = rc(lookup[nounStem])
                    print idHash
                except KeyError:
                    pass

                try:
                    idHash = rc(lookup[firstNoun])
                    print idHash
                except KeyError:
                    pass
                
                if idHash is not None:
                    bookFile = open("hst_text.json", 'r')
                    books = json.load( bookFile )
                    bookFile.close()

                    text = books[idHash].encode('ascii', 'ignore')
                    # print text

                    self.send_msg("%s: %s"%(usernick,text), channel=userOrFalse)

                else:
                    self.send_msg("%s: Can't say I know it." % usernick, channel=userOrFalse)

            else:
                self.send_msg("%s: Nothing to say about that." % usernick, channel=userOrFalse)
            # self.send_msg('/nick itpbot')


        if "ross" in words:
            self.send_msg("%s: I hope you're not speaking ill of my creator." % usernick, channel=userOrFalse)

        if "itp" in words:
            message = rand_itp_acronym()
            self.send_msg(message, channel=userOrFalse)
Example #19
 def stem(self, word):
     return stem(word, stemmer=PORTER)
Example #20
def token_hypernyms(token, recursive, depth):
    '''Stem each token using the pattern library's default stemmer (PORTER).'''
    for synset in wordnet.synsets(stem(token)):
        for hypernym in synset.hypernyms(recursive, depth):
            for sense in hypernym.senses:
                yield sense
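
An illustrative call, assuming from pattern.en import wordnet and from pattern.vector import stem at module level:

for sense in token_hypernyms("cars", recursive=True, depth=2):
    print(sense)   # yields hypernym senses, e.g. vehicle-related synsets (illustrative)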
Example #21
def featureExtractor(textMessage,countgrams):
    textMessage = textMessage.lower()
    #Function to remove stop words
    stopWords = [u'i','m', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now']
    avoidList1 = ["actos", "pioglitazone hydrochloride", "pioglitazone",  "glustin", "glizone", "pioz", "zactos"]

    avoidList2 = ["medformin","metfornin","metforin","glucophage", "metformin", "glucophage xr", "metformin hydrochloride","carbophage sr", "riomet", "fortamet", "glumetza", "obimet", "gluformin", "dianben", "diabex", "diaformin", "siofor","metfogamma", "riomet","diformin","metformi","metphormin","metaforming","metfirman","metoformin","metfomin"]

    avoidList3 = ["byetta", "bydureon", "exenatide","byetta"]

    avoidList4 = ["victosa","victoza", "liraglutide", "saxenda","victoza"]

    avoidList5 = ["invokana", "invokana","canagliflozin"]

    avoidList6 = ["avandia", "rosiglitazone"]

    avoidList7 = ["insu","humalog","levimir","novolog","insuline","insulin glargine","insulins","lantus", "toujeo", "abasaglar", "basaglar","insulin","insulins","levamir","levemir"]

    avoidList8 = ["sitagliptin", "janumet", "januvia", "juvisync","junuvia","januvia","sitaglipton"]

    avoidList9 = ["amaryl", "glimepiride", "gleam", "k-glim-1", "glucoryl",  "glimpid", "glimy","ameryl"]
    
    avoidList10 = ['diabetes','type 2','diabetic']
    avoidList = stopWords + avoidList1 + avoidList2 + avoidList3 + avoidList4 + avoidList5 + avoidList6 + avoidList7 + avoidList8 + avoidList9 + avoidList10
    #Removing these stop words and general cleaning
    singleGrams =  [i for i in textMessage.split() if i not in avoidList]
    singlegramsRefined = []

    #Stemming the words for normalization
    for k in singleGrams:
        r = stem(k, stemmer=LEMMA)
        singlegramsRefined.append(r)
    newMessage = " ".join(singlegramsRefined) 
    newMessage = re.sub("[^A-Za-z]", " ", newMessage)# Removing numbers
    newMessage = re.sub(r'[^\w]', ' ', newMessage)# Removing all non alphanumeric chars
    singleGrams= [i for i in newMessage.split()] #Again splitting to single grams


    singlegramsRefined2 = [word for word in singleGrams] #Keep this now because it works
    biGrams = ngrams(newMessage, n=2)# Generating bigrams
    triGrams = ngrams(newMessage, n=3)#Generating trigrams
    listModelfeatures = modelFeatures()
    totalGramsrefined = []
    if countgrams == 1:
        
        totalGrams = singlegramsRefined2
        
        totalGramsrefined = [i for i in totalGrams if i in listModelfeatures]  # keep only features that the model was trained on

    elif countgrams == 2:
        totalGrams = singlegramsRefined2+biGrams
        
        totalGramsrefined = [i for i in totalGrams if i in listModelfeatures]

    elif countgrams == 3:
        totalGrams = singlegramsRefined2+biGrams + triGrams
        
        totalGramsrefined = [i for i in totalGrams if i in listModelfeatures]
        

    return totalGramsrefined
Example #22
def token_hypernyms(token, recursive, depth):
    '''Stem each token using the pattern library's default stemmer (PORTER).'''
    for synset in wordnet.synsets(stem(token)):
        for hypernym in synset.hypernyms(recursive, depth):
            for sense in hypernym.senses:
                yield sense
Example #23
                       filter=lambda w: w.strip("'").isalnum(),
                       punctuation='.,;:!?()[]{}`'
                       '\"@#$^&*+-|=~_')
    # returns a list of words by splitting the string on spaces.
    freq_dic = count(  # takes a list of words and returns a dictionary of (word, count)-items.
        words=words_list,
        top=None,  # Filter words not in the top most frequent (int).
        threshold=0,  # Filter words whose count <= threshold.
        stemmer=None,  # PORTER | LEMMA | function | None
        exclude=[],  # Filter words in the exclude list.
        stopwords=False,  # Include stop words?
        language='en')  # en, es, de, fr, it, nl
for k, v in freq_dic.iteritems():
    print k, v
# stop words and stemming
print stem('spies', stemmer=PORTER)
print stem('spies', stemmer=LEMMA)
s = 'The black cat was spying on the white cat.'
print count(words(s), stemmer=PORTER)
print count(words(s), stemmer=LEMMA)
s = 'The black cat was spying on the white cat.'
s = Sentence(parse(s))
print count(s, stemmer=LEMMA)
# character n-grams
print chngrams('The cat sat on the mat.'.lower(), n=3)
# document
text = "The shuttle Discovery, already delayed three times by technical problems and bad weather, was grounded again" \
    "Friday, this time by a potentially dangerous gaseous hydrogen leak in a vent line attached to the shipʼs" \
    "external tank. The Discovery was initially scheduled to make its 39th and final flight last Monday, bearing" \
    "fresh supplies and an intelligent robot for the International Space Station. But complications delayed the" \
    "flight from Monday to Friday, when the hydrogen leak led NASA to conclude that the shuttle would not be ready" \
Example #24
	def stemmer(self,word):
		#stemmer=None, stemmer=LEMMA, stemmer=PORTER
		print stem(word,stemmer=PORTER)
Example #25
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))
"""(1) Read from abstracts.txt populated by corporaReader.py"""
print "Reading abstracts.txt ..."
abstractList = []
with open(os.path.join(__location__, 'KeyVisCorpora', 'abstracts.txt'),
          'rU') as inputFile:
    document = inputFile.readlines()
    for abstract in document:
        abstractList.append(abstract)
print "Finished reading  %i abstracts.txt!" % len(abstractList)
"""(2) Create token list from abstractList; unicode encoding"""
print "Creating token list ..."
abstractTokens = [[
    unicode(word, "utf-8", errors="ignore") for word in line.split()
] for line in abstractList]
abstractTokens = [[stem(word, stemmer=LEMMA) for word in line]
                  for line in abstractTokens]
"""Build dictionary and do dictionary pre-processing"""
print "Building dicitonary ..."
dictionary = corpora.Dictionary(abstractTokens)
#remove stop words and words that appear only once
stopwords = stopwords.words('english')
exclusionlist = ['-', 'se', 'h', 'd',
                 'iee']  #manually populated; add to this if necessary
stopwords = stopwords + exclusionlist
stop_ids = [
    dictionary.token2id[stopword] for stopword in stopwords
    if stopword in dictionary.token2id
]
once_ids = [
    tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1
]
Example #26
def extract_overlap(blah, lookup):
    # Parse input text...

    s = parsetree(blah, relations=True, lemmata=True)

    keywords = []
    raw = []
    adj = []
    all_tokens = []

    for sentence in s:
        for chunk in sentence.subjects + sentence.objects:
            h = chunk.head
            if h.type != "PRP":
                keywords.append(h.string)
                raw.append(chunk.string)
        for word in sentence.words:
            if word.type == "JJ":
                adj.append(word.string)
            all_tokens.append(word.string)

    # Make candidate lists...

    key_cand = []
    syn_key_cand = []
    stem_key_cand = []

    adj_cand = []
    syn_adj_cand = []
    stem_adj_cand = []

    all_cand = []
    syn_all_cand = []
    stem_all_cand = []

    last_resort = []

    tags = [k.lower() for k in lookup.keys()]
    
    for word in keywords:
        if word.lower() in tags:
            key_cand.append(word.lower())
        for synword in synonymous(word):
            if synword.lower() in tags:
                syn_key_cand.append(synword.lower())
        if stem(word.lower(), stemmer=PORTER) in tags:
            stem_key_cand.append(stem(word.lower(), stemmer=PORTER))


    for word in adj:
        if word.lower() in tags:
            adj_cand.append(word.lower())
        for synword in synonymous(word):
            if synword.lower() in tags:
                syn_adj_cand.append(synword.lower())
        if stem(word.lower(), stemmer=PORTER) in tags:
            stem_adj_cand.append(stem(word.lower(), stemmer=PORTER))
            
    for word in all_tokens:
        if word.lower() in tags:
            all_cand.append(word.lower())
        for synword in synonymous(word):
            if synword.lower() in tags:
                syn_all_cand.append(synword.lower())
        if stem(word.lower(), stemmer=PORTER) in tags:
            stem_all_cand.append(stem(word.lower(), stemmer=PORTER))
                
    for k in tags:
        if k.lower() in blah.lower():
            last_resort.append(k)

    cand = key_cand + stem_key_cand + adj_cand + stem_adj_cand + all_cand + stem_all_cand + syn_key_cand + syn_adj_cand + syn_all_cand + last_resort

    if cand is None:
        cand = []

    return cand
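
A hypothetical call that ties this back to the lookup file written in Example #1 (synonymous is assumed to be a project helper defined elsewhere):

import json

with open('hst_lookup.json') as infile:
    lookup = json.load(infile)

print(extract_overlap("A very strange letter about politics and fear", lookup))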
Example #27
 def lemmatize(self, word):
     return stem(word, stemmer=LEMMA)
Example #28

"""(1) Read from abstracts.txt populated by corporaReader.py"""
print "Reading abstracts.txt ..."
abstractList = []
with open(os.path.join(__location__, 'KeyVisCorpora', 'abstracts.txt'), 'rU') as inputFile:
	document = inputFile.readlines()
	for abstract in document:
		abstractList.append(abstract)
print "Finished reading  %i abstracts.txt!" % len(abstractList)


"""(2) Create token list from abstractList; unicode encoding"""
print "Creating token list ..."
abstractTokens = [[unicode(word, "utf-8", errors = "ignore") for word in line.split()] for line in abstractList]
abstractTokens = [[stem(word, stemmer=LEMMA) for word in line] for line in abstractTokens]


"""Build dictionary and do dictionary pre-processing"""
print "Building dicitonary ..."
dictionary = corpora.Dictionary(abstractTokens)
#remove stop words and words that appear only once
stopwords = stopwords.words('english')
exclusionlist = ['-', 'se', 'h', 'd', 'iee'] #manually populated; add to this if necessary
stopwords = stopwords + exclusionlist
stop_ids = [dictionary.token2id[stopword] for stopword in stopwords if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq ==1]
dictionary.filter_tokens(stop_ids) #remove stop word ids from the dictionary (alternatively: dictionary.filter_tokens(stop_ids + once_ids))
dictionary.filter_tokens(once_ids) #remove terms that only occur once
dictionary.compactify() # remove gaps in id sequence after words that were removed
dictionary.save(os.path.join(__location__, 'data/KeyVis.dict')) #store dictionary for future reference
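
A possible follow-up step, not part of the snippet above: reloading the stored dictionary in a later session (path reused from the save call):

dictionary = corpora.Dictionary.load(os.path.join(__location__, 'data/KeyVis.dict'))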