Example #1
def Granularity(sentenceArray):
    for sentence in sentenceArray:
        # print(sentence)
        try:

            stemmer = EnglishStemmer()
            sentence = re.sub(r'\#.*?$', '', sentence)
            sentence = re.sub(r'\#.*? ', '', sentence)
            sentence = re.sub(r'\@.*?$', '', sentence)
            sentence = re.sub(r'\@.*? ', '', sentence)
            sentence = re.sub(r'pic.twitter.*?$', '', sentence)
            sentence = re.sub(r'pic.twitter.*? ', '', sentence)
            sentence = re.sub(r'\'m', ' am', sentence)
            sentence = re.sub(r'\'d', ' would', sentence)
            sentence = re.sub(r'\'ll', ' will', sentence)
            sentence = re.sub(r'\&', 'and', sentence)
            sentence = re.sub(r'don\'t', 'do not', sentence)

            data = stemmer.stem(sentence)
            print(data)
            from nltk.corpus import stopwords

            sentence = str(data)
            stop = stopwords.words('english')
            final = [i for i in sentence.split() if i not in stop]
            finalstring = ' '.join(final)
            os.system("printf \"" + str(finalstring) + "\n\">> stemstop/" + word)
        except Exception as e:
            print(e)
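A note on the final step above: shelling out through os.system just to append one line is fragile (the sentence is interpolated into a shell command, and `word` is assumed to be defined outside the function). A minimal sketch of the same append with a plain file handle, keeping the hypothetical `word` output name from the original:

            # hypothetical replacement for the os.system call above;
            # `word` is assumed to be defined in the enclosing scope, as in the original
            with open("stemstop/" + word, "a") as out:
                out.write(finalstring + "\n")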
Example #2
def getAllStemEntities(entities):
    st = EnglishStemmer()
    q = [",", ".", "!", "?", ":", ";"]
    tmp = []
    sourceEntities = [x for x in entities if len(x) > 0]
    np.random.shuffle(entities)

    for i in xrange(len(entities)):
        if len(entities[i]) == 0:
            continue
        if i % 1000 == 0:
            print i
        entities[i] = entities[i].lower()
        entities[i] = entities[i].replace(" - ", " \u2013 ", entities[i].count(" - "))
        entities[i] = entities[i].replace(" -", " \u2013", entities[i].count(" -"))
        entities[i] = entities[i].replace("- ", "\u2013 ", entities[i].count("- "))
        entities[i] = entities[i].replace("-", " - ", entities[i].count("-"))
        entities[i] = entities[i].replace(")", " )", entities[i].count(")"))
        entities[i] = entities[i].replace("(", "( ", entities[i].count("("))
        entities[i] = entities[i].replace("\u0027", " \u0027", entities.count("\u0027"))
        for w in q:
            entities[i] = entities[i].replace(w, " " + w, entities[i].count(w))
        word = entities[i].split(" ")
        s = ""
        for w in word:
            s += st.stem(unicode(w)) + " "
        tmp.append(s[:-1])
        if len(tmp) > 50:
            break

    return tmp, entities[: len(tmp)]
Example #3
def getAllStemEntities(entities):
    st = EnglishStemmer()
    q = [',', '.', '!', '?', ':', ';']
    tmp = []
    sourceEntities = [x for x in entities if len(x)>0]
    np.random.shuffle(entities)

    for i in xrange(len(entities)):
        if len(entities[i]) == 0:
            continue
        if i % 1000 == 0:
            print i
        entities[i] = entities[i].lower()
        entities[i] = entities[i].replace(' - ', ' \u2013 ', entities[i].count(' - '))
        entities[i] = entities[i].replace(' -', ' \u2013', entities[i].count(' -'))
        entities[i] = entities[i].replace('- ', '\u2013 ', entities[i].count('- '))
        entities[i] = entities[i].replace('-', ' - ', entities[i].count('-'))
        entities[i] = entities[i].replace(')', ' )', entities[i].count(')'))
        entities[i] = entities[i].replace('(', '( ', entities[i].count('('))
        entities[i] = entities[i].replace('\u0027', ' \u0027', entities[i].count('\u0027'))
        for w in q:
            entities[i]=entities[i].replace(w, ' '+w, entities[i].count(w))
        word = entities[i].split(' ')
        s = ''
        for w in word:
            s  += st.stem(unicode(w)) + ' '
        tmp.append(s[:-1])
        if len(tmp) > 50:
            break

    return tmp, entities[:len(tmp)]
Example #4
File: parse.py  Project: ccr122/ccr
def str_to_dict(s):
    '''
    creates dictionary of words and counts
    input:  s string
    output: dictionary {word: count}
    '''
    s = s.encode('ascii','ignore')
    s = str(s)
    word_dict = {}
    l = re.findall(WORDRE, s)
    for w in l:
        w = w.lower()               # make all letters lowercase 
        
        if w[0] == "'":             # remove single quotes from beginning/
            w = w[1:]               # end of words in l
        elif w[-1] == "'":
            w = w[:-1]
        
        w = EnglishStemmer().stem(w)        # stems non-noun/verbs 
        w = w.encode('ascii','ignore')
        
        if w != '':
            if w not in word_dict:      # build dictionary
                word_dict[w] = 1
            else:
                word_dict[w] += 1

    return word_dict
Example #5
def query(word):
    db = MySQLdb.connect("127.0.0.1","dizing","ynr3","dizing" )
    cursor=db.cursor()
    snowball_stemmer = EnglishStemmer()
    stem2 = snowball_stemmer.stem(word)
    cursor.execute("SELECT * FROM words WHERE original=%s OR stem1=%s OR stem2=%s", (word,word,stem2))
    rows = cursor.fetchall()
    words1 = dict()
    words2 = dict()
    for row in rows:
        if row[1] == word or row[3]==word:
            words1[word] = row[0]
        else:
            words2[word] = row[0]
    scenes1 = []
    scenes2 = []
    for (i,words_dict) in [(1,words1), (2,words2)]:
        wids = words_dict.values()
        for wid in wids:
            sql = "SELECT s.sentence, s.start, s.stop, s.ready, m.title FROM scene AS s, words_scenes AS ws, movie as m " + \
                           "WHERE ws.wid=%d AND ws.sid=s.sid AND s.mid = m.mid" % int(wid)
            # print sql
            cursor.execute(sql)
            rows = cursor.fetchall()
            if (i==1): scenes1 += rows
            else: scenes2 += rows
    print scenes1
    print scenes2
    db.close()
    return scenes1 + scenes2
Example #6
 def _execute(self):
     
     corpus = mongoExtractText(self.name)
     stemmer = EnglishStemmer()
     for item in corpus:
         line = item.replace(',', ' ')
         stemmed_line = stemmer.stem(line)
         self.sentiment.append((sentiment.sentiment(stemmed_line), stemmed_line))
Example #7
def stem_word(word):
    """
    Stem words
    :param word: (str) text word
    :returns: stemmed word
    """
    stemmer = EnglishStemmer()
    return stemmer.stem(word)
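A quick usage sketch (assuming the function above is in scope; the outputs shown are the usual NLTK Snowball English stemmer results):

print(stem_word("running"))   # 'run'
print(stem_word("cats"))      # 'cat'
print(stem_word("stemming"))  # 'stem'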
Example #8
def as_eng_postagged_doc(doc):
    '''Uses nltk default tagger.'''
    tags    = [t for _, t in nltk.pos_tag(list(doc.word))]
    stemmer = EnglishStemmer()
    lemmata = [stemmer.stem(w) for w in list(doc.word)]
    doc['pos']   = Series(tags)
    doc['lemma'] = Series(lemmata)
    return doc
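A hedged usage sketch, assuming `doc` is a pandas DataFrame with a `word` column (which is what the `doc.word` access and the `Series` assignments suggest) and that NLTK's tagger data is installed:

import nltk
from pandas import DataFrame, Series

doc = DataFrame({'word': ['The', 'cats', 'are', 'running']})
doc = as_eng_postagged_doc(doc)
print(doc[['word', 'pos', 'lemma']])  # POS tags plus Snowball stems labelled 'lemma'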
Example #9
 def use_snowball_stemmer(self,word):
     """
     return stemmed words used snowball algorithm
     :param word:
     :return:
     """
     englishStemmer=EnglishStemmer()
     stemmed_word= englishStemmer.stem(word)
     return stemmed_word
Example #10
def getLemmatizerInfo(pathArticle):

    data = open(pathArticle, "r")
    text1 = data.read().decode('utf-8')

    sourceText = text1

    links1 = []
    l = 0
    for q in text1.split():
        if q == '\ufeff':
            continue
        links1.append([text1.find(q,l), q])
        l = len(q) + 1 + text1.find(q,l)

    text1 = text1.replace(' - ', ' \u2013 ', text1.count(' - '))
    text1 = text1.replace(' -', ' \u2013', text1.count(' -'))
    text1 = text1.replace('- ', '\u2013 ', text1.count('- '))
    text1 = text1.replace('-', ' - ', text1.count('-'))
    text1 = text1.replace('(', '( ', text1.count('('))
    text1 = text1.replace(')', ' )', text1.count(')'))
    text1 = text1.replace(' \u0027', ' \u301E', text1.count(' \u0027'))
    text1 = text1.replace('\u0027', ' \u0027', text1.count('\u0027'))
    text1 = text1.split()
    if text1[0] == u'\ufeff':
        text1=text1[1:]
    text = []
    for word in text1:
        text2 = []
        if len(word) == 0:
            continue
        while word[len(word)-1] in [',','.','!','?',':',';']:
            text2.append(word[len(word)-1])
            word = word[:-1]
            if len(word) == 0:
                break
        text.append(word)
        for i in range(len(text2)-1, -1,-1):
            text.append(text2[i])

    out = ''

    st = EnglishStemmer()

    l = 0
    links = []
    for word in text:
        if isOk(word):
            q = st.stem(word) + ' '
        else:
            q = word + ' '
        out += q.lower()
        links.append([l, q])
        l += len(q)
    return out, links, links1, sourceText
Example #11
def getLemmatizerInfo(pathArticle):

    data = open(pathArticle, "r")
    text1 = data.read().decode("utf-8")

    sourceText = text1

    links1 = []
    l = 0
    for q in text1.split():
        if q == "\ufeff":
            continue
        links1.append([text1.find(q, l), q])
        l = len(q) + 1 + text1.find(q, l)

    text1 = text1.replace(" - ", " \u2013 ", text1.count(" - "))
    text1 = text1.replace(" -", " \u2013", text1.count(" -"))
    text1 = text1.replace("- ", "\u2013 ", text1.count("- "))
    text1 = text1.replace("-", " - ", text1.count("-"))
    text1 = text1.replace("(", "( ", text1.count("("))
    text1 = text1.replace(")", " )", text1.count(")"))
    text1 = text1.replace(" \u0027", " \u301E", text1.count(" \u0027"))
    text1 = text1.replace("\u0027", " \u0027", text1.count("\u0027"))
    text1 = text1.split()
    if text1[0] == u"\ufeff":
        text1 = text1[1:]
    text = []
    for word in text1:
        text2 = []
        if len(word) == 0:
            continue
        while word[len(word) - 1] in [",", ".", "!", "?", ":", ";"]:
            text2.append(word[len(word) - 1])
            word = word[:-1]
            if len(word) == 0:
                break
        text.append(word)
        for i in range(len(text2) - 1, -1, -1):
            text.append(text2[i])

    out = ""

    st = EnglishStemmer()

    l = 0
    links = []
    for word in text:
        if isOk(word):
            q = st.stem(word) + " "
        else:
            q = word + " "
        out += q.lower()
        links.append([l, q])
        l += len(q)
    return out, links, links1, sourceText
Example #12
def stemming(tweet):
    tweets = tweet.split()
    wrdStemmer = EnglishStemmer()
    stemTweet =[]
    try:
        for tweet in tweets:
            tweet = wrdStemmer.stem(tweet)
            stemTweet.append(tweet)
    except:
        print("Error: Stemming")
    return " ".join(stemTweet)
Example #13
def fix_lemma_problem(pred_scores, targets, space):
    from nltk.stem.snowball import EnglishStemmer
    es = EnglishStemmer()
    r = pred_scores.copy()
    lemmas = np.array([es.stem(v) for v in space.vocab])
    for i, t in enumerate(targets):
        g = es.stem(space.vocab[t])
        mask = (lemmas == g)
        #print space.vocab[t], np.sum(mask)
        r[i][mask] = -1e9
        #print r[i][mask]
    return r
Example #14
def main(fname):
  e = EnglishStemmer()

  n, a = 0, 0
  for line in open(fname):  # use the fname argument rather than re-reading sys.argv
    title, body, tags, creationdate, acceptedanswerid, score, viewcount = eval(line)

    # Process text into tokens
    html_tags = RX_OPEN_TAGS.findall(body)
    body = RX_TAGS.sub("",body)
    print " ".join(e.stem(s) for s in RX_NONWORD.split(body))
    M = bayes.NaiveLearner(adjust_threshold=True, name="Adjusted Naive Bayes")
Example #15
def stemmed(text, snowball=False):
    """Returns stemmed text
    """
    if snowball:
        st = EnglishStemmer()
    else:
        st = PorterStemmer()
    words = wordpunct_tokenize(text)
    words = [st.stem(w) for w in words]
    text = ' '.join(words)

    return text
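A short usage sketch of the helper above, assuming the NLTK pieces it relies on (`wordpunct_tokenize`, `EnglishStemmer`, `PorterStemmer`) are imported at module level:

print(stemmed("The cats were running", snowball=True))   # e.g. 'the cat were run'
print(stemmed("The cats were running", snowball=False))  # Porter output; very similar here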
Example #16
def get_stemmed_keywords(keywords):

  stemmer = EnglishStemmer()
  stemmed_keywords = list(keywords)
  # split into list of list
  stemmed_keywords = [keyword.split() for keyword in stemmed_keywords]
  # stem individual words
  stemmed_keywords = [list(stemmer.stem(word) for word in keyword) for keyword in stemmed_keywords]
  # list of words to string
  stemmed_keywords = [' '.join(keyword).encode('ascii') for keyword in stemmed_keywords]

  return stemmed_keywords
Example #17
def similarity_score(word1, word2):
    """ see sections 2.3 and 2.4 of http://dx.doi.org.ezp-prod1.hul.harvard.edu/10.1109/TKDE.2003.1209005
    :type word1: string
    :type word2: string
    :return: float: between 0 and 1; similarity between two given words
    """
    stemmer = EnglishStemmer()
    if stemmer.stem(word1) == stemmer.stem(word2):
        return 1
    alpha = 0.2
    beta = 0.6
    l, h = get_path_length_and_subsumer_height(word1, word2)
    return exp((-1)*alpha*l)*((exp(beta*h)-exp((-1)*beta*h))/(exp(beta*h)+exp((-1)*beta*h)))
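The long-hand fraction in the return statement is just the hyperbolic tangent of beta*h, so the score is exp(-alpha*l) * tanh(beta*h). An equivalent, more readable sketch of that final expression (not the original code):

from math import exp, tanh

def path_depth_score(l, h, alpha=0.2, beta=0.6):
    # same value as the original return expression
    return exp(-alpha * l) * tanh(beta * h)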
Example #18
def normalize_tags():
    cursor.execute('SELECT app_id, tag, times FROM tag_app_rel;')
    all_tag_data = defaultdict(dict)
    for r in cursor:
        all_tag_data[r[0]][r[1]] = r[2]
    from nltk.stem.snowball import EnglishStemmer
    stemmer = EnglishStemmer()
    for app_id, tag_to_times in all_tag_data.iteritems():
        normalized_app_tag_dict = defaultdict(int)
        for tag, times in tag_to_times.iteritems():
            normalized_app_tag_dict[stemmer.stem(tag)] += times
        for tag, times in normalized_app_tag_dict.iteritems():
            cursor.execute('INSERT INTO tag_app_relation (app_id, tag, times) VALUES (%s, %s, %s)', (app_id, tag, times))
Example #19
def nltk_tokenizer(text, min_size=4, *args, **kwargs):
	from nltk.stem.snowball import EnglishStemmer
	from nltk.corpus import stopwords as stwds
	from nltk.tokenize import TreebankWordTokenizer
	
	stemmer = EnglishStemmer()
	stopwords = set(stwds.words('english'))
	
	text = [stemmer.stem(w) for w in TreebankWordTokenizer().
			tokenize(text) if not w in stopwords 
			and len(w) >= min_size]

	return text
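An illustrative call (hedged; requires the NLTK stopwords corpus and Treebank tokenizer to be available):

print(nltk_tokenizer("The runners were running through the gardens"))
# stop words and short tokens are dropped, the rest stemmed, e.g. ['runner', 'run', 'garden']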
Example #20
def tokenize_documents(documents):

    stop_words = stopwords.words('english') + stopwords.words('spanish') #common words to be filtered
    english = EnglishStemmer()
    arabic = ISRIStemmer()

    punctuation = { ord(char): None for char in string.punctuation}

    def valid_word(token, filtered=stop_words): 
        # Returns false for common words, links, and strange patterns
            if (token in filtered) or (token[0:4] == u'http') or\
            (token in string.punctuation):
                return False
            else:
                return True

    for doc in documents:

        row = doc[0]
        doc = doc[1]

        if doc is not None:

            # remove trailing whitespace
            doc = doc.strip()
            # remove twitter handles (words in doc starting with @)
            doc = re.sub(r"@\w+|\b@\w+", "", doc)
            # lowercase letters
            doc = doc.lower()
            # remove punctuation
            doc = doc.translate(punctuation)

            # tokenization: handles documents with arabic or foreign characters
            tokens = nltk.tokenize.wordpunct_tokenize(doc)

            cleaned_tokens = []
            for token in tokens:

                # for valid words, correct spellings of gaddafi and stem words
                if valid_word(token):
                
                    if token in [u'gadhafi', u'gadafi', u'ghadhafi', u'kadhafi', u'khadafi', u'kaddafi']:
                        token = u'gaddafi'
                    else:
                        token = arabic.stem(english.stem(token)) 

                    cleaned_tokens.append(token)    

            yield row
            yield cleaned_tokens
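Because the generator alternates between yielding the row identifier and the cleaned token list, a hedged usage pairs consecutive yields (assuming `documents` is an iterable of (row, text) pairs, as the `doc[0]`/`doc[1]` indexing implies, and that the NLTK/ISRI imports the snippet relies on are in place):

docs = [(1, "Reports about Gadhafi from @user http://example.com")]
stream = tokenize_documents(docs)
for row, tokens in zip(stream, stream):  # zip of the same iterator pairs each row with its tokens
    print(row, tokens)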
Example #21
def stem_sen(list_sentences):
  stemmer = EnglishStemmer()
  # map back should be a dict with words,
  # each word map to 3 version: noun, adj, verb,
  # and each version is a list of pair
  lem = WordNetLemmatizer()
  mapping_back = {}
  res_list = []
  res_sen = []

  # of course we want to return a list of sentences back as well
  for sent in list_sentences:
    tmp_list = []
    tok_list = word_tokenize(sent)
    tok_pos = nltk.pos_tag(tok_list)
    for tok,pos in tok_pos:
      if (tok.lower() in stopwords.words('english')):
        continue
      if len(tok) == 1:
        continue
      tok = lem.lemmatize(tok)
      pos = pos[:2]
      if ('NN' not in pos) and ('JJ' not in pos) and ('VB' not in pos):
        continue
      stem_tok = stemmer.stem(tok)
      if (stem_tok not in mapping_back):
        mapping_back[stem_tok] = {}
      if pos not in mapping_back[stem_tok]:
        mapping_back[stem_tok][pos] = {}

      # increase count
      if tok not in mapping_back[stem_tok][pos]:
        mapping_back[stem_tok][pos][tok] = 1
      else:
        mapping_back[stem_tok][pos][tok] += 1
      tmp_list.append(stem_tok + '-' + pos)
    res_sen.append(tmp_list)
  res_map = {}

  # do the second run through to find the most frequent - mapping
  for tok in mapping_back:
    for pos in mapping_back[tok]:
      tmp_tok = tok + '-' + pos
      # find the most frequently, unstemmed word correspond to the stemmer + tagged
      most_freq = max(mapping_back[tok][pos], key = mapping_back[tok][pos].get)
      res_map[tmp_tok] = most_freq.encode('ascii')
      res_list.append(tmp_tok)
  return res_sen, res_list, res_map
Example #22
 def tokenize(self):
     terms = word_tokenize(self.text);
     self.tokens = [];
     self.lemmas = []
     stemmer = EnglishStemmer();
     lemmatizer = WordNetLemmatizer()
     for term in terms:
         try:
             self.tokens.append(stemmer.stem(term).lower())
             self.lemmas.append(lemmatizer.lemmatize(term.lower()))
         except Exception, e:
             print 'current text:', self.text;
             print 'current term:', term;
             print str(e);
             sys.exit(-1);
Example #23
def exe_compress_word(argv):
    word_stat_path, comp_word_stat_path = argv;
    stemmer = EnglishStemmer();
    word_stat = load_word_stat(word_stat_path);
    compress_word_stat = {};
    for word, count in word_stat.items():
        if count <= 0:
            continue;
        word = stemmer.stem(word.lower().decode('utf8'));
        compress_word_stat.__setitem__(word, max(word_stat.get(word,0), count));
    words = compress_word_stat.keys();
    words.sort();
    f = open(comp_word_stat_path, 'w');
    for word in words:
        f.write('%s %d\n' % (word.encode('utf8'), compress_word_stat[word]));
    f.close();
Example #24
 def __init__(self):
     """
     class initialization:
     tokenizer- NLTK compatible tokenizer function
     stemmer- NLTK compatible stemmer 
     stop_words- list of ignored words
     lemm- NLTK compatible lemmatizer
     inv_index- (defaultdict) the inverted index
     positional_index- (defaultdict of defaultdicts) relevant for the bonus task only
     """
     # Tokenization
     self.tokenizer = word_tokenize
     
     # Stemming
     self.stemmer = EnglishStemmer()
     #self.stemmer = nltk.PorterStemmer()
     #self.stemmer = nltk.LancasterStemmer()
     
     # Stopwords
     self.stop_words = stopwords.words('english')
     
     # Lemmatization
     self.lemm = nltk.WordNetLemmatizer()
     
     # The invereted index
     self.inv_index = defaultdict(list)
     # The positional index (for the bonus task)
     self.positional_index = defaultdict(lambda: defaultdict(list))
Example #25
def pre_proc(in_str, removestop=True, alwayskeep=False, word_punc=False, unquote=False):
    # remove accents, wordify punctuation
    in_str = strip_accents(in_str, wordify=word_punc, unquote=unquote)
    en_stem = EnglishStemmer()
    # tokenize string
    if removestop:  # remove stop words
        tok_list = filter(lambda x: x not in stopwords.words('english'), wordpunct_tokenize(in_str))
    else:
        tok_list = wordpunct_tokenize(in_str)
    new_tok_list = []
    for tok in tok_list:
        if tok not in WORD_PUNC_LIST:
            correct_spell = HOBJ.spell(tok)
            if not correct_spell:
                suggestions = [strip_accents(tmp_sug).lower() for tmp_sug in HOBJ.suggest(tok)]
            else:
                suggestions = []
            if correct_spell or (tok.lower() in suggestions):
                new_tok_list.append(tok)
                tok_stem = en_stem.stem(tok)
                if tok_stem != tok:
                    new_tok_list.append(tok_stem)
            elif len(tok) >= 3:
                tok_sug = None
                lev_perc = .34
                for sug in suggestions:
                    if not tok_sug and tok == sug[1:]:
                        tok_sug = sug
                if not tok_sug:
                    for sug in suggestions:
                        tmp_lev_perc = float(lev_dist(tok, sug)) / float(max(len(tok),len(sug)))
                        if not tok_sug and tmp_lev_perc < lev_perc:
                            tok_sug = sug
                            lev_perc = tmp_lev_perc
                if tok_sug:
                    new_tok_list.append(tok_sug)
                    tok_stem = en_stem.stem(tok_sug)
                    if tok_stem != tok_sug:
                        new_tok_list.append(tok_stem)
                elif alwayskeep:
                    new_tok_list.append(tok)
            elif alwayskeep:
                new_tok_list.append(tok)
        else:
            new_tok_list.append(tok)
    out_str = string.join(new_tok_list, ' ')
    return out_str.lower()
Example #26
def tokenizeTweet(tweet,unique = True):
	allWords = [word.lower() for word in word_tokenize(tweet)]

    # deletes @users, RT and URLs and saves #hashtags
	nWords, i = len(allWords), 0
	hashtags = []
	while i < nWords:
		if allWords[i] == '@':      # @users
			allWords[i:i + 2] = []
			nWords -= 2
		elif allWords[i] == 'rt':   # delete RT
			allWords[i:i + 1] = []
			nWords -= 1
		elif allWords[i] == '#':    # save the hashtag
			try:
				hashtags.append(allWords[i + 1])
				allWords[i:i + 2] = []
				nWords -= 2
			except:
				allWords[i:i + 1] = []
				nWords -= 1
		elif allWords[i] == "http":     # delete url starting with http:
			allWords[i:i + 3] = []
			nWords -= 3
		elif allWords[i][0:3] == 'www':  # delete urls starting with www.
			allWords[i:i + 1] = []
			nWords -= 1
		else:
			i += 1

	possibleWords = filter(lambda x: x not in ourStopWords and x.isdigit() == False, allWords)
	stemmer = EnglishStemmer()
	tokens = []
	for word in possibleWords:
		aux = str(stemmer.stem(word))
		if unique:
			if(aux not in tokens):		# this makes each token appears only once
				tokens.append(aux)
		else:
			tokens.append(aux)			
	for tag in hashtags:		# this makes each token appears only once
		if unique:
			if '#' + tag not in tokens:
				tokens.append('#' + tag)
		else:
			tokens.append('#'+tag)
	return tokens
Example #27
def textrank(text):
    sentences = sent_tokenize(text)

    tokenizer = RegexpTokenizer(r"\w+")
    lmtzr = EnglishStemmer()

    words = [set(lmtzr.stem(word) for word in tokenizer.tokenize(sentence)) for sentence in sentences]

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)

    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)

    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr), key=lambda x: pr[x[0]], reverse=True)
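The `similarity` function used to score sentence pairs is not shown in this snippet; a plausible, purely hypothetical stand-in is a Jaccard overlap of the two stemmed-word sets:

def similarity(words_i, words_j):
    # hypothetical helper: fraction of shared stems between two sentences
    if not words_i or not words_j:
        return 0.0
    return len(words_i & words_j) / float(len(words_i | words_j))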
Example #28
 def GET(self):
     #return "Hello, world!"
 #def query(req):
     data = web.input()
     word=str(data.word)
     """
     parameters = util.FieldStorage(req)
     word = parameters['word']
      
     req.write(word)
     """
     #print word
     db = MySQLdb.connect("127.0.0.1","root","","dizing" )
     cursor=db.cursor()
     snowball_stemmer = EnglishStemmer()
     stem2 = snowball_stemmer.stem(word)
     cursor.execute("SELECT * FROM words WHERE original=%s OR stem1=%s OR stem2=%s", (word,word,stem2))
     rows = cursor.fetchall()
     words1 = dict()
     words2 = dict()
     for row in rows:
         if row[1] == word or row[3]==word:
             words1[word] = row[0]
         else:
             words2[word] = row[0]
     scenes1 = []
     scenes2 = []
     
     for (i,words_dict) in [(1,words1), (2,words2)]:
         wids = words_dict.values()
         for wid in wids:
             sql = "SELECT s.sentence, s.start, s.stop, m.title FROM scene AS s, words_scenes AS ws, movie as m " + \
                            "WHERE ws.wid=%d AND ws.sid=s.sid AND s.mid = m.mid" % int(wid)
             print sql
             cursor.execute(sql)
             rows = cursor.fetchall()
             if (i==1): scenes1 += rows
             else: scenes2 += rows
     print scenes1
     print scenes2
     #req.write(str(scenes1))
     #req.write(str(scenes2))
     db.close()
     result = { 'scenes1': scenes1, 'scenes2': scenes2 }
     return json.dumps(result)
Example #29
def text_processing(text, min_size=4, sep_char=' '):
	from nltk.stem.snowball import EnglishStemmer
	from nltk.corpus import stopwords as stwds

	stemmer = EnglishStemmer()
	stopwords = set(stwds.words('english') + 
			contractions_without_punc)
	
	text = [stemmer.stem(w) for w in text.split(sep_char) 
			if not w in stopwords
			and len(w) >= min_size]

	return text
Example #30
def computeSentiment(tweet_text):
    pos_count = 0
    neg_count = 0
    pos_terms = []
    neg_terms = []
    st = EnglishStemmer()

    tokenized_tweet = tokenize(tweet_text)
    for t in tokenized_tweet:
        #print st.stem(t.lower())
        if st.stem(t.lower()) in negative_terms:
            neg_terms.append(t.lower())
            neg_count += 1
        elif st.stem(t.lower()) in positive_terms:
            pos_terms.append(t.lower())
            pos_count += 1

    return pos_count, neg_count, set(pos_terms), set(neg_terms)
Example #31
def query(index: dict, request: str):
    request = request.replace("."," ")
    request = request.replace(","," ")
    request = request.translate(str.maketrans('', '', string.punctuation))
    request = request.lower()
    request = remove_stopwords(request)
    raw_request = request
    words = request.split()
    words2 = []
    for i in words:
        words2.append(EnglishStemmer().stem(i))
    words = words2


    if len(words) == 1:
        if request in index:
            if len(index[request]) >=5:
                for i in range(5):
                    print(index[request][i])
            else:
                candidates = []
                synonyms = []
                for i in index[request]:
                    candidates.append(i)
                for syn in wordnet.synsets(raw_request): 
                    for l in syn.lemmas(): 
                        synonyms.append(l.name()) 
                for i in set(synonyms):
                    s = EnglishStemmer().stem(i)
                    if s in index:  # guard against synonyms whose stems are not in the index
                        for i in range(min(5, len(index[s]))):
                            candidates.append([index[s][i][0], index[s][i][1]/2, index[s][i][2], index[s][i][3], index[s][i][4]])
                candidates.sort(key=lambda x: x[1], reverse=True)
                for i in range(min(5, len(candidates))):
                    print(candidates[i])
        else:
            candidates = []
            synonyms = []
            for syn in wordnet.synsets(raw_request): 
                for l in syn.lemmas(): 
                    synonyms.append(l.name()) 
            for i in set(synonyms):
                s = EnglishStemmer().stem(i)
                if s in index:
                    for i in range(min(5,len(index[s]))):
                        candidates.append([index[s][i][0],index[s][i][1]/2,index[s][i][2],index[s][i][3],index[s][i][4]])
                else:
                    continue
            candidates.sort(key = lambda x: x[1], reverse=True)
            for i in range(min(5,len(candidates))):
                print(candidates[i])
            

    if len(words) == 2:
        counter = 0
        candidates = []
        for i in index[words[0]]:
            if counter > 10:
                break
            for j in index[words[1]]:
                if i[0] == j[0]:
                    candidates.append([i[0],i[1]+j[1],i[2],i[3],i[4]])
                    counter+=1
                    if counter > 10:
                        break
        if len(candidates) >= 5:
            candidates.sort(key = lambda x: x[1], reverse=True)
            for i in range(min(5,len(candidates))):
                print(candidates[i])
        else:
            if words[0] in index:
                for i in range(min(5,len(index[words[0]]))):
                    candidates.append([index[words[0]][i][0],index[words[0]][i][1]/10000,index[words[0]][i][2],index[words[0]][i][3],index[words[0]][i][4]])
            if words[1] in index:
                for i in range(min(5,len(index[words[1]]))):
                    candidates.append([index[words[1]][i][0],index[words[1]][i][1]/10000,index[words[1]][i][2],index[words[1]][i][3],index[words[1]][i][4]])
            candidates.sort(key = lambda x: x[1], reverse=True)
            for i in range(min(5,len(candidates))):
                print(candidates[i])

    if len(words) == 3:
        counter = 0
        candidates = []
        for i in index[words[0]]:
            if counter > 10:
                break
            for j in index[words[1]]:
                if i[0] == j[0]:
                    candidates.append([i[0],i[1]+j[1],j[2],i[3],i[4]])
                    counter+=1
                    if counter > 10:
                        break
        counter2 = 0
        candidates2 = []
        for i in index[words[1]]:
            if counter2 > 10:
                break
            for j in index[words[2]]:
                if i[0] == j[0]:
                    candidates2.append([i[0],i[1]+j[1],j[2],i[3],i[4]])
                    counter2+=1
                    if counter2 > 10:
                        break
        for i in candidates2:
            candidates.append(i)
        if len(candidates) >= 5:
            candidates.sort(key = lambda x: x[1], reverse=True)
            for i in range(min(5,len(candidates))):
                print(candidates[i])
        else:
            if words[0] in index:
                for i in range(min(5,len(index[words[0]]))):
                    candidates.append([index[words[0]][i][0],index[words[0]][i][1]/10000,index[words[0]][i][2],index[words[0]][i][3],index[words[0]][i][4]])
            if words[1] in index:
                for i in range(min(5,len(index[words[1]]))):
                    candidates.append([index[words[1]][i][0],index[words[1]][i][1]/10000,index[words[1]][i][2],index[words[1]][i][3],index[words[1]][i][4]])
            if words[2] in index:
                for i in range(min(5,len(index[words[2]]))):
                    candidates.append([index[words[2]][i][0],index[words[2]][i][1]/10000,index[words[2]][i][2],index[words[2]][i][3],index[words[2]][i][4]])
            candidates.sort(key = lambda x: x[1], reverse=True)
            for i in range(min(5,len(candidates))):
                print(candidates[i])
    

    if len(words) == 4:
        counter = 0
        candidates = []
        for i in index[words[0]]:
            if counter > 10:
                break
            for j in index[words[1]]:
                if i[0] == j[0]:
                    candidates.append([i[0],i[1]+j[1],j[2],i[3],i[4]])
                    counter+=1
                    if counter > 10:
                        break
        counter2 = 0
        candidates2 = []
        for i in index[words[1]]:
            if counter2 > 10:
                break
            for j in index[words[2]]:
                if i[0] == j[0]:
                    candidates2.append([i[0],i[1]+j[1],j[2],i[3],i[4]])
                    counter2+=1
                    if counter2 > 10:
                        break
        counter3 = 0
        candidates3 = []
        for i in index[words[2]]:
            if counter3 > 10:
                break
            for j in index[words[3]]:
                if i[0] == j[0]:
                    candidates3.append([i[0],i[1]+j[1],j[2],i[3],i[4]])
                    counter3+=1
                    if counter3 > 10:
                        break
        for i in candidates3:
            candidates.append(i)
        if len(candidates) >= 5:
            candidates.sort(key = lambda x: x[1], reverse=True)
            for i in range(min(5,len(candidates))):
                print(candidates[i])
        else:
            if words[0] in index:
                for i in range(min(5,len(index[words[0]]))):
                    candidates.append([index[words[0]][i][0],index[words[0]][i][1]/10000,index[words[0]][i][2],index[words[0]][i][3],index[words[0]][i][4]])
            if words[1] in index:
                for i in range(min(5,len(index[words[1]]))):
                    candidates.append([index[words[1]][i][0],index[words[1]][i][1]/10000,index[words[1]][i][2],index[words[1]][i][3],index[words[1]][i][4]])
            if words[2] in index:
                for i in range(min(5,len(index[words[2]]))):
                    candidates.append([index[words[2]][i][0],index[words[2]][i][1]/10000,index[words[2]][i][2],index[words[2]][i][3],index[words[2]][i][4]])
            if words[3] in index:
                for i in range(min(5,len(index[words[3]]))):
                    candidates.append([index[words[3]][i][0],index[words[3]][i][1]/10000,index[words[3]][i][2],index[words[3]][i][3],index[words[3]][i][4]])    
            candidates.sort(key = lambda x: x[1], reverse=True)
            for i in range(min(5,len(candidates))):
                print(candidates[i])
        
        
    if len(words) == 5:
        counter = 0
        candidates = []
        for i in index[words[0]]:
            if counter > 10:
                break
            for j in index[words[1]]:
                if i[0] == j[0]:
                    candidates.append([i[0],i[1]+j[1],j[2],i[3],i[4]])
                    counter+=1
                    if counter > 10:
                        break
        counter2 = 0
        candidates2 = []
        for i in index[words[1]]:
            if counter2 > 10:
                break
            for j in index[words[2]]:
                if i[0] == j[0]:
                    candidates2.append([i[0],i[1]+j[1],j[2],i[3],i[4]])
                    counter2+=1
                    if counter2 > 10:
                        break
        counter3 = 0
        candidates3 = []
        for i in index[words[2]]:
            if counter3 > 10:
                break
            for j in index[words[3]]:
                if i[0] == j[0]:
                    candidates3.append([i[0],i[1]+j[1],j[2],i[3],i[4]])
                    counter3+=1
                    if counter3 > 10:
                        break
        for i in candidates3:
            candidates.append(i)
        counter4 = 0
        candidates4 = []
        for i in index[words[3]]:
            if counter4 > 10:
                break
            for j in index[words[4]]:
                if i[0] == j[0]:
                    candidates4.append([i[0],i[1]+j[1],j[2],i[3],i[4]])
                    counter4+=1
                    if counter4 > 10:
                        break
        for i in candidates4:
            candidates.append(i)
        if len(candidates) >= 5:
            candidates.sort(key = lambda x: x[1], reverse=True)
            for i in range(min(5,len(candidates))):
                print(candidates[i])
        else:
            if words[0] in index:
                for i in range(min(5,len(index[words[0]]))):
                    candidates.append([index[words[0]][i][0],index[words[0]][i][1]/10000,index[words[0]][i][2],index[words[0]][i][3],index[words[0]][i][4]])
            if words[1] in index:
                for i in range(min(5,len(index[words[1]]))):
                    candidates.append([index[words[1]][i][0],index[words[1]][i][1]/10000,index[words[1]][i][2],index[words[1]][i][3],index[words[1]][i][4]])
            if words[2] in index:
                for i in range(min(5,len(index[words[2]]))):
                    candidates.append([index[words[2]][i][0],index[words[2]][i][1]/10000,index[words[2]][i][2],index[words[2]][i][3],index[words[2]][i][4]])
            if words[3] in index:
                for i in range(min(5,len(index[words[3]]))):
                    candidates.append([index[words[3]][i][0],index[words[3]][i][1]/10000,index[words[3]][i][2],index[words[3]][i][3],index[words[3]][i][4]])
            if words[4] in index:
                for i in range(min(5,len(index[words[4]]))):
                    candidates.append([index[words[4]][i][0],index[words[4]][i][1]/10000,index[words[4]][i][2],index[words[4]][i][3],index[words[4]][i][4]])        
            candidates.sort(key = lambda x: x[1], reverse=True)
            for i in range(min(5,len(candidates))):
                print(candidates[i])

    return
Example #32
test = 'this is a test \'string\' where the stop can\'t words should be removed, also we want to use synonyms to get a better result.'

stop = stopwords.words('english')

tokenizer = RegexpTokenizer(r'\w+')
result1 = ([i for i in tokenizer.tokenize(test)])
print result1

tokenizer2 = TweetTokenizer()
result12 = ([i for i in tokenizer.tokenize(test)])
print result12

result2 = ([i for i in result1 if i not in stop])
print result2

st1 = LancasterStemmer()
result3 = ([st1.stem(i) for i in result2])
print result3

st2 = EnglishStemmer()
result4 = ([st2.stem(i) for i in result2])
print result4

st3 = WordNetLemmatizer()
result5 = ([st3.lemmatize(i) for i in result2])
print result5

st4 = PorterStemmer()
result6 = ([st4.stem(i) for i in result2])
print result6
Example #33
 def __init__(self):
     self.stemmer = EnglishStemmer()
     return
Example #34
import nltk
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
from flask import abort
from helpers import ret_success
from helpers import ret_failure
from helpers import parse_input
from helpers import penn_to_wn

LancasterSt = LancasterStemmer()
PorterSt = PorterStemmer()
SnowballSt = EnglishStemmer()
WordnetLm = WordNetLemmatizer()


def stemmer(method, data):
    """
	Takes an array of words in JSON format.
	"""
    data = parse_input(data)
    if data == False:
        return ret_failure(703)
    else:
        res = []
        if method == "lancaster":
            for word in data:
                try:
                    res.append([word, LancasterSt.stem(word)])
                except:
Example #35
class EnglishAnalyzer(object):
    extra_stop_words = None
    ngram_range = (1, 1)
    alphafilter = re.compile(r"(?u)[^a-z ]+")
    token_pattern = re.compile(r"(?u)\b\w\w+\b")
    stemmer = None

    def __init__(self, extra_stop_words=None, ngram_range=(1, 1)):
        self.stemmer = EnglishStemmer()
        self.ngram_range = ngram_range

        self.stop_words = ENGLISH_STOP_WORDS
        if extra_stop_words is not None:
            self.stop_words = self.stop_words | set(extra_stop_words)

    def analyze(self, doc):
        '''
        replaces the analyze function of any sk-learn text vectorizer with improved text handling, namely:
        - filter only alphabetic words
        - stem words
        - enhanced stopwords
        '''
        return self.word_ngrams(
            self.filter_stem(self.tokenize(self.preprocess(doc))),
            self.ngram_range)

    def preprocess(self, doc):
        return self.alphafilter.sub(' ', strip_accents_ascii(doc.lower()))

    def tokenize(self, doc):
        return self.token_pattern.findall(doc)

    def filter_stem(self, tokens):
        return [
            self.stemmer.stem(w) for w in tokens if w not in self.stop_words
        ]

    def word_ngrams(self, tokens, ngram_range):
        # handle token n-grams, copied from
        # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L146-L175

        min_n, max_n = self.ngram_range
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append
            space_join = " ".join

            for n in range(min_n, min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i:i + n]))

        return tokens
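As the docstring says, `analyze` is meant to stand in for a scikit-learn vectorizer's analyzer. A hedged usage sketch, assuming the class's own dependencies (`ENGLISH_STOP_WORDS` and `strip_accents_ascii` from `sklearn.feature_extraction.text`, plus `re` and `EnglishStemmer`) are imported where it is defined:

from sklearn.feature_extraction.text import CountVectorizer

analyzer = EnglishAnalyzer(ngram_range=(1, 2))
vectorizer = CountVectorizer(analyzer=analyzer.analyze)
X = vectorizer.fit_transform(["The films were surprisingly good", "Good films age well"])
print(vectorizer.get_feature_names_out())  # get_feature_names() on older scikit-learn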
Example #36
def pre_process_dev_test(root_d,
                         input_f,
                         min_freq,
                         word2inx_path,
                         count_in_sent=False,
                         data_type='dev'):
    """

    :param root_d:
    :param input_f:
    :param min_freq: minimum frequency, an integer
    :param count_in_sent: if need to count words in sentence
    :return:
    """
    y = {}
    X = {}
    i = 0
    stemmer = EnglishStemmer()
    with open(os.path.join(root_d, input_f)) as f_handle:
        for each_line in f_handle:
            # each_line = '2 If you \'re paying attention , the `` big twists '' are pretty easy to guess - but that does n\'t make the movie any less entertaining .'
            each_line = each_line.strip()
            if data_type == 'dev':
                y[i] = each_line[0]
                x = each_line[1:].strip().lower()
            else:
                y[i] = 0
                x = each_line[:].strip().lower()
            x = re.sub(r'[\']?\d+[st]*', 'number', x)
            x = re.sub(r'\\/', ' ', x)
            x = re.sub(r'ca n\'t', 'can not', x)
            x = re.sub(r'n\'t', 'not', x)
            x = re.sub(r'\'re', 'are', x)
            x = re.sub(r'\'m', 'am', x)
            x = re.sub(r'it \'s', 'it is', x)
            x = re.sub(r'that \'s', 'that is', x)
            x = re.sub(r'there \'s', 'there is', x)
            x = re.sub(r'\?', 'question_mark', x)
            x = re.sub(r'!', 'exclamation_mark', x)
            x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore')
            x_list = x.decode('utf-8').split(' ')
            x_list_stemmed = [stemmer.stem(word)
                              for word in x_list]  # stemming
            X[i] = x_list_stemmed
            i += 1
    print(i)
    if data_type == 'dev':
        y_json_path = os.path.join(root_d, 'y_dev.json')
        X_json_path = os.path.join(root_d, 'X_dev.json')
        if not os.path.exists(y_json_path):
            with open(y_json_path, 'a') as f_handle:
                json.dump(y, f_handle, indent=2)
        if not os.path.exists(X_json_path):
            with open(X_json_path, 'a') as f_handle:
                json.dump(X, f_handle, indent=2)

    # sentence to index and convert y to np.array
    X_inx = {}
    X_new = {}
    y_array = np.zeros([len(y)], dtype=np.int)
    if os.path.exists(word2inx_path):
        with open(word2inx_path, 'r') as f_handle:
            word2inx = json.load(f_handle)
    else:
        print("There is no 'word2inx.json' file, please run train.py first")
    for i, m in enumerate(sorted(list(X.keys()))):
        each_sentence = X[m]
        X_inx[m] = []
        X_new[m] = []
        y_array[i] = y[i]
        for word in each_sentence:
            if word in word2inx:
                X_new[m].append(word)
                X_inx[m].append(word2inx[word])
                # print(' '.join(X_new[m]))
    # convert sentence to vector
    X_matrix = np.zeros([len(X_inx), len(word2inx)], dtype=np.int)
    total_sentence_inx = sorted(list(X_inx.keys()))
    # print(total_sentence_inx)
    for i in total_sentence_inx:
        if count_in_sent:
            inx_num = count_num_in_sent(X_inx[i])
            X_matrix[i][inx_num['inxs']] = inx_num['num']
        else:
            X_matrix[i][X_inx[i]] = 1
        # if i == 1:
        #     print(X_inx[i])
        # print(inx_num)
    # delete the sentences that have no word
    if data_type == 'test':
        return {'X': X_matrix}
    sen_inx = np.where(X_matrix.sum(1) != 0)
    X_matrix = X_matrix[sen_inx]
    y_array = y_array[sen_inx]
    if data_type == 'dev':
        return {'X': X_matrix, 'y': y_array}
Example #37

###############################################################################
# Process data
###############################################################################

# Processing the text
# 1) Tokenize and lowercase
# 2) Remove punctuation and special characters
# 3) Remove stopwords
# 4) Apply stemming
# 5) Removing blank/empty strings and single characters from analysis

stopwords = nltk.corpus.stopwords.words('english')
word_lemma = WordNetLemmatizer()
word_stem = EnglishStemmer()


def process_text(text):
    punctuation_to_remove = string.punctuation + "’‘—“”"
    strip = str.maketrans('', '', punctuation_to_remove)
    sub_filter = r"\b[a-zA-Z]\b"
    p_text = list(
        filter(None, [
            re.sub(sub_filter, "", word_stem.stem(word.translate(strip)))
            for word in tknzr(text.lower()) if word not in stopwords
        ]))
    return p_text


###############################################################################
Example #38
def set_stemmer(stemmer_language):
    if (stemmer_language == "GER"):
        stemmers = GermanStemmer()
    else:
        stemmers = EnglishStemmer()
    return stemmers
Example #39
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)

print('after: %s ...' % y_train[:5])

# In[435]:

# Porter Stemmer

import nltk
import string
import re
from nltk.stem.snowball import EnglishStemmer
snowball = EnglishStemmer()
porter_stemmer = nltk.stem.porter.PorterStemmer()


def porter_tokenizer(text, stemmer=porter_stemmer):
    lower_txt = text.lower()
    tokens = nltk.wordpunct_tokenize(lower_txt)
    stems = [porter_stemmer.stem(t) for t in tokens]
    no_punct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]
    return no_punct


def snowball_tokenizer(text, stemmer=snowball):
    lower_txt = text.lower()
    tokens = nltk.wordpunct_tokenize(lower_txt)
    stems = [snowball.stem(t) for t in tokens]
Example #40
    if tempEmail.is_multipart():
        for part in tempEmail.walk():
            ctype = part.get_content_type()
            cdispo = str(part.get('Content-Disposition'))

            if ctype == 'text/plain' and 'attachment' not in cdispo:
                tempMessage = part.get_payload(decode=False)
                break
    else:
        tempMessage = tempEmail.get_payload(decode=False)

    tempSubject = tempSubject.lower()
    tempMessage = tempMessage.lower()

    es = EnglishStemmer()
    dist = re.sub(r'[^a-zA-Z]', " ", tempMessage)
    dis = word_tokenize(dist)

    for token in dis:

        if token in stopwords.words('english'):
            continue

        else:
            if token in dictionary:
                dictionary[token] += 1
            else:
                dictionary[token] = 1

    forwardIndex[route] = dictionary
Example #41
def pre_process(root_d, input_f, min_freq, count_in_sent=False):
    """
    :param root_d:
    :param input_f:
    :param min_freq: minimum frequency, an integer
    :param count_in_sent: if need to count words in sentence
    :return:
    """
    y = {}
    X = {}
    word_count = {}
    word2inx = {}
    inx2word = []
    i = 0
    stemmer = EnglishStemmer()
    with open(os.path.join(root_d, input_f)) as f_handle:
        for each_line in f_handle:
            # each_line = '2 If you \'re paying attention , the `` big twists '' are pretty easy to guess - but that does n\'t make the movie any less entertaining .'
            each_line = each_line.strip()
            y[i] = each_line[0]
            x = each_line[1:].strip().lower()
            x = re.sub(r'[\']?\d+[st]*', 'number', x)
            x = re.sub(r'\\/', ' ', x)
            x = re.sub(r'ca n\'t', 'can not', x)
            x = re.sub(r'n\'t', 'not', x)
            x = re.sub(r'\'re', 'are', x)
            x = re.sub(r'\'m', 'am', x)
            x = re.sub(r'it \'s', 'it is', x)
            x = re.sub(r'that \'s', 'that is', x)
            x = re.sub(r'there \'s', 'there is', x)
            x = re.sub(r'\?', 'question_mark', x)
            x = re.sub(r'!', 'exclamation_mark', x)
            x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore')
            x_list = x.decode('utf-8').split(' ')
            x_list_stemmed = [stemmer.stem(word)
                              for word in x_list]  # stemming
            X[i] = x_list_stemmed
            # X[i] = x_list
            i += 1
    # y_json_path = os.path.join(root_d, 'y_train.json')
    # X_json_path = os.path.join(root_d, 'X_train.json')
    # if not os.path.exists(y_json_path):
    #     with open(y_json_path, 'a') as f_handle:
    #         json.dump(y, f_handle, indent=2)
    # if not os.path.exists(X_json_path):
    #     with open(X_json_path, 'a') as f_handle:
    #         json.dump(X, f_handle, indent=2)
    # count
    for m in X:
        each_sentence = X[m]
        for word in each_sentence:
            if word not in word_count:
                word_count[word] = 1
            else:
                word_count[word] += 1
    # word_count_path = os.path.join(root_d, 'word_count_in_train.json')
    # if not os.path.exists(word_count_path):
    #     with open(word_count_path, 'a') as f_handle:
    #         json.dump(word_count, f_handle, indent=2)
    inx = 0
    for m in sorted(word_count.keys()):
        if (not re.search(r'^\W+$',
                          m)) and (word_count[m] >= min_freq) and (len(m) > 1):
            word2inx[m] = inx  # these words will become word vector
            inx2word.append((inx, m))
            inx += 1
    # inx2word_path = os.path.join(root_d, 'inx2word.json')
    # if not os.path.exists(inx2word_path):
    #     with open(inx2word_path, 'a') as f_handle:
    #         json.dump(inx2word, f_handle, indent=2)
    # used in test
    word2inx_path = os.path.join(root_d, 'word2inx.json')
    if not os.path.exists(word2inx_path):
        with open(word2inx_path, 'a') as f_handle:
            json.dump(word2inx, f_handle, indent=2)

    # sentence to index and convert y to np.array
    X_inx = {}
    X_new = {}
    y_array = np.zeros([len(y)], dtype=np.int)
    for i, m in enumerate(sorted(list(X.keys()))):
        each_sentence = X[m]
        X_inx[m] = []
        X_new[m] = []
        y_array[i] = y[i]
        for word in each_sentence:
            if word in word2inx:
                X_new[m].append(word)
                X_inx[m].append(word2inx[word])
        # print(' '.join(X_new[m]))
    # convert sentence to vector
    X_matrix = np.zeros([len(X_inx), len(word2inx)], dtype=np.int)
    total_sentence_inx = sorted(list(X_inx.keys()))
    # print(total_sentence_inx)
    inx_num = {}
    for i in total_sentence_inx:
        if count_in_sent:
            inx_num = count_num_in_sent(X_inx[i])
            X_matrix[i][inx_num['inxs']] = inx_num['num']
        else:
            X_matrix[i][X_inx[i]] = 1
        # if i == 1:
        #     print(X_inx[i])
        # print(inx_num)
    # delete the sentences that have no word
    sen_inx = np.where(X_matrix.sum(1) != 0)
    X_matrix = X_matrix[sen_inx]
    y_array = y_array[sen_inx]
    # np.save('X', X_matrix)
    # np.save('y', y_array)
    return {'X': X_matrix, 'y': y_array}
Example #42
import pandas as pd
import numpy as np
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import wordpunct_tokenize
import sys

reload(sys)
sys.setdefaultencoding('ISO-8859-1')
stemmer = EnglishStemmer()

with open('./input/train.csv', mode='r') as f1:
    with open('./input/train_stemmed.csv', 'w') as f2:
        lines = f1.readlines()
        for i, line in enumerate(lines):
            f2.write(" ".join([
                stemmer.stem(word.lower()) for word in wordpunct_tokenize(line)
            ]) + "\n")

with open('./input/test.csv', mode='r') as f1:
    with open('./input/test_stemmed.csv', 'w') as f2:
        lines = f1.readlines()
        for i, line in enumerate(lines):
            f2.write(" ".join([
                stemmer.stem(word.lower()) for word in wordpunct_tokenize(line)
            ]) + "\n")

with open('./input/product_descriptions.csv', mode='r') as f1:
    with open('./input/product_descriptions_stemmed.csv', 'w') as f2:
        lines = f1.readlines()
        for i, line in enumerate(lines):
            f2.write(" ".join([
Example #43
import os
from nltk.stem.snowball import EnglishStemmer
from pkgutil import get_data

# Variables
# ---------------------------------------------------------------------------------------------
stopwords = {
    word.strip()
    for word in str(get_data('data', 'stopwords.txt').decode('utf-8')).split(
        '\n')
}
stemmer = EnglishStemmer()  # stemmer class
wordPattern = "^[^\W\d_]+$"  # regex pattern to match a word
epsilon = 1e-4  # epsilon value for algorithm
damping = 0.85  # damping value for algorithm
delta = 1e-7  # delta value for algorithm
# ---------------------------------------------------------------------------------------------
Example #44
text_rem = [x for x in text_3 if x not in text_4]

##we're going to use a similar format to apply various stemming/lemmatizing/synonyms algorithms

from nltk.stem.lancaster import LancasterStemmer

st = LancasterStemmer()

from nltk.stem import PorterStemmer

pt = PorterStemmer()

from nltk.stem.snowball import EnglishStemmer

sb = EnglishStemmer()

from nltk.stem.wordnet import WordNetLemmatizer

wn = WordNetLemmatizer()

##let's examine the word ``better"
st.stem('better')
pt.stem('better')
sb.stem('better')
wn.lemmatize('better', 'a')

wn.lemmatize('families', 'n')

##
##applying the porter stemmer to the gettysburg address
Example #45
from nltk.corpus import stopwords  #import stopwords from nltk corpus
from nltk.stem.snowball import FrenchStemmer  #import the French stemming library
from nltk.stem.snowball import EnglishStemmer  #import the English stemming library
from nltk.tokenize import TreebankWordTokenizer  #import the Treebank tokenizer
from nltk.tokenize import WordPunctTokenizer

from nltk.probability import FreqDist

#import lib to detect language
import elastic.detect_lang
import elastic.common as common

#name stemmers
stemmer_fr = FrenchStemmer()
stemmer_en = EnglishStemmer()

# Load tokenizer
# You can choose the most efficient, however wordpunct is working well
#tokenizer = TreebankWordTokenizer()
tokenizer = WordPunctTokenizer()


# stemer function text
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

Example #46
def stemming_tokenizer(text, stemmer = EnglishStemmer()):
	stemmed_text = [stemmer.stem(word) for word in word_tokenize(text, language='english')]
	return stemmed_text
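A hedged usage sketch; a tokenizer like this is typically handed to a scikit-learn vectorizer via its `tokenizer=` argument, but a direct call already shows the behaviour (requires NLTK's punkt data):

print(stemming_tokenizer("The cats were running quickly"))
# roughly: ['the', 'cat', 'were', 'run', 'quick']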
Example #47
class DocParse:
    max_sentence = 10000
    include_file = ''
    outfile = ''
    count = 0

    def __init__(self):
        self.stemming_on = False
        self.stop_word_on = False
        self.summary = False
        self.use_threshold = False
        self.max_words_in_summary = 100
        self.keep_all = True  # by default do not exclude dupe words
        self.normalize = False
        self.rval = 0
        self.score = 'size'  # size | tfidf | stfidf
        self.update = False
        self.penalty = False
        self.total_sentences = 0
        self.sentence_dictionary = defaultdict(list)  # map of modified sentences to actual sentences (tokenized)

        self.dictionary = {}
        # keys are final tokenized output
        # values are 2-tuple of original sentence and size

        self.mod_words = ()  # all unique words of document
        self.mod_sentences = ((),)
        self.unique_sent = ((),)
        self.alg = Alg.Algorithms()
        self.stemmer = EnglishStemmer()
        self.doc_size = 0

    def tokenize(self, in_file):
        """Reads in_file and tokenizes into words."""

        global debug_on
        global punctuations
        if debug_on: print('stem:', self.stemming_on)
        if debug_on: print('stop:', self.stop_word_on)
        if debug_on: print('keep:', self.keep_all)
        f = open(in_file)
        raw = f.read()
        sentences_list = []
        words_list = []
        dictionary_values = []
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        raw_sentences = sent_tokenizer.tokenize(raw)
        self.total_sentences = len(raw_sentences)

        # regex to match for xml type strings
        regex = re.compile('</|<[A-Za-z]|[A-Za-z]>')

        # operate on each sentence string from in_file
        for s, sentence in enumerate(raw_sentences):
            if debug_on: print('sentence #', str(s + 1))

            # if regex match count greater than 2, reduce sentence to nothing
            count = len(re.findall(regex, sentence))
            if count > 2:
                sentence = " "

            # remove newlines, after tokenizer.
            sentence = sentence.replace('\n', ' ')
            if debug_on: print(s, sentence[0], sentence)
            # change sentence into tokens (words)
            tokens = nltk.word_tokenize(sentence)
            # create table that maps punctuation (except apostrophes) to None
            table = str.maketrans({key: None for key in string.punctuation if key != '\''})
            # keep only words and numbers
            words = [word.lower() for word in tokens if (word.translate(table) and word != '\'')]
            if debug_on:
                print("nltk tokens", end=":")
                print(tokens)
                print("parsed words", end=": ")
                print(words)
                print(len(words))
            sentence_size = len(words)
            if debug_on: print('sent len:', str(sentence_size))
            # remove stop words
            if self.stop_word_on:
                filtered_words = [word for word in words if word not in stopwords.words('english')]
                words = filtered_words
            # stem words
            if self.stemming_on:
                filtered_words = [self.stemmer.stem(word) for word in words]
                words = filtered_words
            if debug_on: print('after filters:', str(words))
            # compress sentences to unique words only if not doing greedy3 or tf-idf
            if self.keep_all:
                unique_words = words
                # removes repeated sentences
                if words not in sentences_list:
                    sentences_list.append(words)
                    dictionary_values.append((sentence, sentence_size, s))
            else:
                # make list of unique words from current sentence
                unique_words = list(set(words))
                # if unique word set not in sentence list than add this set
                # all repeated sentences will be removed at this stage
                if unique_words not in sentences_list:
                    sentences_list.append(unique_words)
                    # update local dictionary that maps index to tuple (original sentence, and length)
                    dictionary_values.append((sentence, sentence_size, s))
            if debug_on: print(sentences_list)

            # add unique words to doc word list
            for w, word in enumerate(words):
                if word not in words_list:
                    words_list.append(word)

            # add the modified sentence into dictionary
            self.sentence_dictionary[tuple(unique_words)].append(sentence)

        f.close()

        # this loop changes all the sentences of sentence_list into tuples
        for s, sentence in enumerate(sentences_list):
            sentences_list[s] = tuple(sentence)
            self.dictionary[sentences_list[s]] = dictionary_values[s]

        # store word list as tuple
        # store sentence list as tuple
        self.mod_words = tuple(words_list)
        self.mod_sentences = tuple(sentences_list)
        self.doc_size = len(self.mod_sentences)

    def find_dominating_set(self, option='greedy'):
        if option == 'greedy':
            if self.score == 'size':
                if self.normalize:
                    self.do_g_unique()
                else:
                    self.do_g_size()
            elif self.score == 'tfidf':
                self.do_g_tfidf()
            elif self.score == 'stfidf':
                self.do_g_stfidf()
        elif option == 'dynamic':
            self.do_dynamic()
        elif option == 'optimal':
            if self.optimal_type == 'dp':
                self.do_bottomup()
            elif self.optimal_type == 'ilp':
                self.do_ilp()
        elif option == 'mcdonald':
            self.do_mcdonald()

    def do_mcdonald(self):
        global debug_on
        self.alg.mcdonald(self.mod_sentences, self.mod_words, self.dictionary,
                          use_threshold=self.use_threshold, word_count=self.max_words_in_summary)
        print(self.make_summary(self.alg.dynamic_ans))

    def do_g_size(self):
        global debug_on
        answer = self.alg.greedy(self.mod_sentences, self.mod_words, self.dictionary,
                                 update=self.update, penalty=self.penalty,
                                 word_threshold=self.use_threshold,
                                 word_count=self.max_words_in_summary)
        if debug_on: print('greedy answer', answer)
        if self.summary:
            print(self.make_summary(answer))
        else:
            print('len(ans):', len(answer))
            print('len(doc):', self.total_sentences)
        if debug_on:
            print('*****')
            print(self.sentence_dictionary)
            print('*****')

    def do_g_unique(self):
        global debug_on
        answer = self.alg.greedy2(self.mod_sentences, self.mod_words, self.sentence_dictionary, self.dictionary,
                                  rval=self.rval,
                                  update=self.update, penalty=self.penalty,
                                  word_threshold=self.use_threshold,
                                  word_count=self.max_words_in_summary)
        if debug_on: print('greedy answer', answer)
        if self.summary:
            print(self.make_summary(answer))
        else:
            print('len(ans):', len(answer))
            print('len(doc):', self.total_sentences)
        if debug_on:
            print('*****')
            print(self.sentence_dictionary)
            print('*****')

    def do_g_tfidf(self):
        global debug_on
        self.answer = self.alg.tfidf(self.mod_sentences, self.mod_words, self.dictionary,
                                     rval=self.rval,
                                     ratio=self.normalize,
                                     update=self.update, penalty=self.penalty,
                                     word_count=self.max_words_in_summary,
                                     use_threshold=self.use_threshold)
        if debug_on: print('tfidf answer', self.answer)
        if self.summary:
            print(self.make_summary(self.answer))
        else:
            print('len(ans):', len(self.answer))
            print('len(doc):', self.total_sentences)
            if debug_on:
                print('*****')
                print(self.sentence_dictionary)
                print('*****')

    def do_g_stfidf(self):
        global debug_on
        answer = self.alg.stfidf(self.mod_sentences, self.mod_words, self.dictionary,
                                 rval=self.rval,
                                 update=self.update, penalty=self.penalty,
                                 ratio=self.normalize,
                                 word_count=self.max_words_in_summary,
                                 use_threshold=self.use_threshold)
        if debug_on: print('tfidf answer', answer)
        if self.summary:
            print(self.make_summary(answer))
        else:
            print('len(ans):', len(answer))
            print('len(doc):', self.total_sentences)
            if debug_on:
                print('*****')
                print(self.sentence_dictionary)
                print('*****')

    def do_g_rtfidf(self):
        global debug_on
        answer = self.alg.tfidf(self.mod_sentences, self.mod_words, self.dictionary, ratio=True,
                                use_threshold=self.use_threshold)
        if debug_on: print('tfidf answer', answer)
        if self.summary:
            print(self.make_summary(answer))
        else:
            print('len(ans):', len(answer))
            print('len(doc):', self.total_sentences)
            if debug_on:
                print('*****')
                print(self.sentence_dictionary)
                print('*****')

    def do_bottomup(self):
        global debug_on
        self.alg.bottom_up(self.mod_sentences)
        if self.summary:
            # print(self.alg.dynamic_ans)
            print(self.make_summary(self.alg.dynamic_ans))
        else:
            print('len(ans):', len(self.alg.dynamic_ans))
            print('len(doc):', self.total_sentences)

    def do_ilp(self):
        global debug_on
        self.alg.ilp(self.mod_sentences)
        print('alg.dynamic_ans:\n', self.alg.dynamic_ans)
        if self.summary:
            # print(self.alg.dynamic_ans)
            print(self.make_summary(self.alg.dynamic_ans))
        else:
            print('len(ans):', len(self.alg.dynamic_ans))
            print('len(doc):', self.total_sentences)

    def do_dynamic(self):
        global debug_on
        if debug_on: print(self.mod_sentences)
        if debug_on: print(self.mod_words)
        if self.doc_size > 20:
            print('too many sentences:', self.doc_size)
            return
        # else:
        #     print('there are', self.doc_size, 'sentences')
        self.alg.dynamic(self.mod_sentences, self.mod_words)
        # self.sd.dynamic_lookup(set_of_sents, set_of_words)
        if debug_on: print('')
        self.alg.dynamic_calc_answer(self.mod_sentences, self.mod_words)
        if debug_on: print(self.alg.dynamic_ans)
        if debug_on:
            for i, items in enumerate(self.alg.dynamic_ans):
                print(i, ":", items)
        if self.summary:
            print(self.make_summary(self.alg.dynamic_ans))
        else:
            print('len(ans):', len(self.alg.dynamic_ans))
            print('len(doc):', self.total_sentences)
        # print 'dynamic answer', answer
        pass

    def make_summary(self, sentences):
        global debug_on
        ret_val = []
        word_count = 0
        for sentence in sentences:
            if self.dictionary[sentence][1] <= (self.max_words_in_summary - word_count) or \
                            self.max_words_in_summary == 0:
                if debug_on: print(str(self.dictionary[sentence][1]) + ": " + self.dictionary[sentence][0])
                ret_val.append(self.dictionary[sentence][0])
                word_count += self.dictionary[sentence][1]
            else:
                if debug_on: print(str(self.dictionary[sentence][1]) + ": " + self.dictionary[sentence][0])
                ret_val.append(self.shorten(self.dictionary[sentence][0], self.max_words_in_summary - word_count))
                break
                pass
        if self.outfile:
            with open(self.outfile, 'w') as f:
                f.write(" ".join(ret_val))
            pass
        return " ".join(ret_val)
        pass

    def shorten(self, sentence, length):
        global punctuations
        global debug_on
        tokens = nltk.word_tokenize(sentence)
        # count only non-punctuation tokens toward the requested length
        words_used = 0
        words = []
        for word in tokens:
            if words_used == length:
                break
            if debug_on: print(word, end=' ')
            words.append(word)
            if word not in punctuations:
                words_used += 1
                if debug_on: print('keep', words_used)
            else:
                if debug_on: print('remove')
        # words = [word for word in tokens if word not in punctuations]
        # return " ".join(words[:length-len(words)])
        return " ".join(words)
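
# A hedged usage sketch (illustrative only): DocParse depends on an external Alg
# module and on module-level globals such as debug_on and punctuations, plus the
# NLTK 'punkt' and 'stopwords' data, so this assumes all of those are available.
parser = DocParse()
parser.stemming_on = True
parser.stop_word_on = True
parser.summary = True
parser.tokenize('input.txt')          # placeholder path to a plain-text document
parser.find_dominating_set('greedy')  # prints the extracted summary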
Ejemplo n.º 48
0
        count = 0
        count1 = 0
        count2 = 0
        for word, index in self.index.items():
            print(word, ':', index)
            temp = index[0][0]
            for i in index:
                if (i[0] != temp):
                    temp = i[0]
                    count = count + 1
                if (i[0] == 0):
                    count1 = count1 + 1
                else:
                    count2 = count2 + 1
        return [count / count1, count / count2, count]


index = Index(nltk.word_tokenize, EnglishStemmer(),
              nltk.corpus.stopwords.words('english'))

for i in range(0, 2):
    # Converting .pdf file to .txt file
    reader = PyPDF2.PdfFileReader('E:/C/Python/IWP_Project/test' + str(i) +
                                  '.pdf')
    text = data_func.convert_pdf_to_string('E:/C/Python/IWP_Project/test' +
                                           str(i) + '.pdf')
    text_file = open('E:/C/Python/IWP_Project/test' + str(i) + '.txt', 'w')
    n = text_file.write(text)
    text_file.close()

    file = open('E:/C/Python/IWP_Project/test' + str(i) + '.txt')
    read = file.read()
    file.seek(0)
Ejemplo n.º 49
0
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from stop_words import *
import string
from pymystem3 import Mystem
from nltk.corpus import stopwords
import nltk

nltk.download("stopwords")
nltk.download('punkt')

rus_stem = Mystem()
en_stem = EnglishStemmer()


def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "en"


def stem(text):
    return [
        word for word in rus_stem.lemmatize(text)
        if not set(string.punctuation) & set(word) and word.strip()
        and word not in stopwords.words('russian') + ['https', 'ru']
    ]
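
# A hedged usage sketch combining the helpers above (the sample strings are made
# up; pymystem3 downloads its Mystem binary on first use):
for text in ["Кошки любят спать на солнце", "Cats love sleeping in the sun"]:
    if detect_language(text) == "ru":
        print(stem(text))
    else:
        print([en_stem.stem(w) for w in word_tokenize(text)])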
Ejemplo n.º 50
0
 def stem_lokens(tokens):
     sst = EnglishStemmer()
     return [[sst.stem(word) for word in tokens_i] for tokens_i in tokens]
Ejemplo n.º 51
0
 def __init__(self):
     self.featureLookup={}
     self.class_map_dic={}
     self.index = Index(nltk.word_tokenize,
                   EnglishStemmer(),
                   nltk.corpus.stopwords.words('english'))
Ejemplo n.º 52
0
 def __init__(self):
     self.stoplist = set(stopwords.words('english'))
     self.stemmer = EnglishStemmer()
Ejemplo n.º 53
0
class CleanText:

    def __init__(self, stop_en=None, stop_th=None, keyword=None):

        import re
        import os
        from nltk.stem.snowball import EnglishStemmer

        self.pattern_thai_char = re.compile(u'[\u0e00-\u0e7f]')
        self.pattern_new_sentence = re.compile('\.[0-9]+(\)|\.) ')
        self.pattern_th_out = re.compile(u'[\u0e00-\u0e7f][^\u0e00-\u0e7f]')
        self.pattern_th_in = re.compile(u'[^\u0e00-\u0e7f][\u0e00-\u0e7f]')
        self.pattern_num_bullet = re.compile('^[0-9]+(\)|\.)*$')
        self.pattern_eng_token = re.compile('^[a-zA-Z]+$')
        self.pattern_number = re.compile('\+*[0-9]+')
        self.pattern_phone_number = re.compile('[0-9]+-[0-9]+-[0-9]+')
        self.pattern_email = re.compile('[a-zA-Z._\-0-9]+@[a-zA-Z._\-0-9]+')
        self.pattern_url = re.compile('(https://|www.)[a-zA-Z0-9]+.[a-z]+[^\s]*')
        self.pattern_sentence_collide = re.compile('[a-z][A-Z]')
        self.pattern_thai_name = re.compile(u'\u0e04\u0e38\u0e13\s*[\u0e00-\u0e7f]+\s+')
        self.pattern_prefix_garbage = re.compile('^\-|^\||^\.|^\#{1,2}|^(\-\|)|^(\+\|)|^(\#\|)|^(\.\|)')
        self.charset = {}
        with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'dict', 'charset'), 'rt') as charfile:
            for item in charfile.read().split('\n'):
                if len(item) < 4:
                    self.charset[item] = ord(item)
                else:
                    self.charset[chr(int(item, 16))] = int(item, 16)
        self.stemming = EnglishStemmer()

        if stop_en:
            with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'dict', stop_en), 'rt', encoding='utf-8') as stop_file:
                self.stop_en = set([item for item in stop_file.read().split('\n')])
        else:
            self.stop_en = set([])
        if stop_th:
            with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'dict', stop_th), 'rt', encoding='utf-8') as stop_file:
                self.stop_th = set([item for item in stop_file.read().split('\n')])
        else:
            self.stop_th = set([])
        if keyword:
            with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'dict', keyword), 'rt', encoding='utf-8') as keyword_file:
                self.keyword = set([item for item in keyword_file.read().split('\n')])
        else:
            self.keyword = set([])

    def clean_text(self, text):

        def validate_char(val_text):
            val_text = val_text.replace('&amp;', ' ')
            val_text = val_text.replace('&nbsp;', ' ')
            ret_text = ''
            for cha in val_text:
                try:
                    self.charset[cha]
                except KeyError:
                    ret_text += ' '
                else:
                    ret_text += cha
            while self.pattern_prefix_garbage.search(ret_text):
                ret_text = self.pattern_prefix_garbage.sub(' ', ret_text)
            while ret_text.find('  ') != -1:
                ret_text = ret_text.replace('  ', ' ')
            return ret_text

        def split_th_en(splt_text):
            insert_pos = []
            splt_text = splt_text[:]
            for pos, item in enumerate(splt_text[:-2]):
                if self.pattern_th_in.search(splt_text[pos:pos+2]) or self.pattern_th_out.search(splt_text[pos:pos+2]):
                    insert_pos.append(pos + 1)
            for pos in reversed(insert_pos):
                splt_text = splt_text[:pos] + ' ' + splt_text[pos:]
            return splt_text

        def remove_thai_stop(th_text):
            stop_pos = [[0, 0]]
            ## TH : do longest matching
            for j in range(len(th_text)-1):
                for k in range(j+1, min(len(th_text), j+36)):
                    if th_text[j:k] in self.stop_th:
                        # found a stop word: instead of returning a string, record
                        # the matched span (positions j to k)
                        if j <= stop_pos[-1][1]:
                            stop_pos[-1] = [stop_pos[-1][0], k]
                        else:
                            stop_pos.append([j, k])
                        break
            newstr = ''
            if len(stop_pos) == 1:
                newstr = th_text
            else:
                for j in range(len(stop_pos)-1):
                    newstr += th_text[stop_pos[j][1]:stop_pos[j+1][0]] + ' '
            return newstr

        text = text.replace(u'\u0e46', ' ')
        text = self.pattern_email.sub(' ', text)
        text = self.pattern_url.sub(' ', text)
        text = self.pattern_phone_number.sub(' ', text)
        text = self.pattern_thai_name.sub(' ', text)
        text = split_th_en(text)
        text = self.pattern_new_sentence.sub(' . ', text)
        text = text.replace('.', ' . ')
        text = validate_char(text)
        text = remove_thai_stop(text)

        text_split = text.split(' ')
        text_split = [item for item in text_split[:] if item not in self.stop_en
                      and not self.pattern_num_bullet.search(item)]
        text_split = [self.stemming.stem(item) if self.pattern_eng_token.search(item) and
                                                  item not in self.keyword else item for item in text_split[:]]
        text = '|'.join(text_split)

        return text
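
# A hedged usage sketch (illustrative only): CleanText reads a 'charset' file and
# optional stop-word / keyword files from a 'dict' directory next to this module,
# so this assumes those files exist; the file names and sample text are made up.
cleaner = CleanText(stop_en='stop_en.txt', stop_th='stop_th.txt')
print(cleaner.clean_text('Contact me at someone@example.com or visit www.example.com today'))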
Ejemplo n.º 54
0
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import *
from nltk.stem.snowball import EnglishStemmer

with open('test.txt', 'r') as f:
    f_contents = f.read().decode("utf-8-sig").encode(
        "utf-8")  #decode the contents to unicode and encode to utf-8
    words = word_tokenize(f_contents)
    stemmer = PorterStemmer()
    #for w in words:
    #   print(stemmer.stem(w))
    print stemmer.stem('having')

    stemmer2 = SnowballStemmer('english')
    print stemmer2.stem('distribution')

    stemmer3 = EnglishStemmer()
    print stemmer3.stem('require')
Ejemplo n.º 55
0
class VocabProcessor:
    def __init__(self,
                 tokenizer_fn,
                 batch_size,
                 remove_stop_words,
                 should_stem,
                 limit_vocab=True,
                 max_vocab_size=80000):
        self.vocab = defaultdict(
            self.next_value
        )  # map tokens to ids. Automatically gets next id when needed
        self.token_counter = Counter()  # Counts the token frequency
        self.vocab[PAD] = globals.PAD_ID
        self.vocab[UNK] = globals.UNK_ID
        self.vocab[START] = globals.START_ID
        self.vocab[EOS] = globals.END_ID
        self.next = globals.END_ID  # After 3 comes 4
        self.tokenizer = tokenizer_fn
        self.reverse_vocab = {}

        self.batch_size = batch_size
        self.remove_stop_words = remove_stop_words
        self.should_stem = should_stem
        self.limit_vocab = limit_vocab
        self.max_vocab_size = max_vocab_size

        if remove_stop_words:
            remove_words = stopwords.words("english") + list(punctuation)
            # not included in punctuation for some reason
            remove_words.append("+")
            remove_words.append("``")
            remove_words.append("''")
            remove_words.append("'s")
            self.remove_word_set = set(remove_words)
            # adding punctuation from the same flag
        if self.should_stem:
            self.stemmer = EnglishStemmer()

    """Gets the next index for defaultdict"""

    def next_value(self):
        self.next += 1
        return self.next

    def reset_processor(self):
        self.vocab = defaultdict(
            self.next_value
        )  # map tokens to ids. Automatically gets next id when needed
        self.token_counter = Counter()  # Counts the token frequency
        self.vocab[PAD] = globals.PAD_ID
        self.vocab[UNK] = globals.UNK_ID
        self.vocab[START] = globals.START_ID
        self.vocab[EOS] = globals.END_ID
        self.next = globals.END_ID  # After 3 comes 4
        self.reverse_vocab = {}

    """Convert the ids back to strings."""

    def ids_to_string(self, tokens, length=None):
        string = ''.join([self.reverse_vocab[x] for x in tokens[:length]])
        return string

    def convert_token_to_id(self, token):
        '''
        Gets a token and looks it up in the vocabulary. If it doesn't exist in the
        vocab it is added with a new id, unless the vocab size limit is reached,
        in which case the UNK id is returned.
        :param token:
        :return: the token id in the vocab
        '''
        if self.limit_vocab:
            if token in self.vocab:
                self.token_counter[token] += 1
                return self.vocab[token]
            else:
                if self.next < self.max_vocab_size:
                    self.token_counter[token] += 1
                    return self.vocab[token]
                else:
                    self.token_counter[UNK] += 1
                    return self.vocab[UNK]
        else:
            self.token_counter[token] += 1
            return self.vocab[token]

    # does more than just tokenization
    def tokenize(self, text):
        words = self.tokenizer(text)

        # 7-17-18 1:12 PM Testing with lowercase all words, works, switched to this!
        if self.remove_stop_words:
            if globals.VOCAB_LOWERCASE:
                words = [
                    word.lower() for word in words
                    if word.lower() not in self.remove_word_set
                ]
            else:
                words = [
                    word for word in words
                    if word.lower() not in self.remove_word_set
                ]
        if self.should_stem:
            words = [self.stemmer.stem(word) for word in words]
        return words

    def tokens_to_id_list(self, tokens):
        return list(map(self.convert_token_to_id, tokens))

    # def sentence_to_id_list(self, sent):
    # tokens = self.sentence_to_tokens(sent)
    # id_list = self.tokens_to_id_list(tokens)
    # return id_list

    # def sentence_to_numpy_array(self, sent):
    # id_list = self.sentence_to_id_list(sent)
    # return np.array(id_list)

    # All used to map back to vocab, not tested fully or used

    # def update_reverse_vocab(self):
    # self.reverse_vocab = {id_: token for token, id_ in self.vocab.items()}

    # def id_list_to_text(self, id_list):
    # tokens = ''.join(map(lambda x: self.reverse_vocab[x], id_list))
    # return tokens

    # Untested
    def save(self, filename):
        """Saves vocabulary processor into given file.
    Args:
      filename: Path to output file.
    """
        with gfile.Open(filename, 'wb') as f:
            f.write(pickle.dumps(self))

    @classmethod
    def restore(cls, filename):
        """Restores vocabulary processor from given file.
    Args:
      filename: Path to file to load from.
    Returns:
      VocabularyProcessor object.
    """
        with gfile.Open(filename, 'rb') as f:
            return pickle.loads(f.read())
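
# A hedged usage sketch (illustrative only): VocabProcessor relies on a
# project-specific `globals` module (PAD_ID, UNK_ID, START_ID, END_ID) and on the
# PAD/UNK/START/EOS constants, so this assumes those are importable.
import nltk
vp = VocabProcessor(tokenizer_fn=nltk.word_tokenize,
                    batch_size=32,
                    remove_stop_words=True,
                    should_stem=True)
tokens = vp.tokenize("The cats were running quickly through the garden.")
print(tokens, vp.tokens_to_id_list(tokens))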
Ejemplo n.º 56
0
                    #feat_tf_map = str(feat)+':'+str(tf)

                    f.write(feat_tf_idf_map)
                    f.write(' ')
                    doc_list[class_label].append(feat_tf_idf_map)
                    #doc_list[k].append(feat_tf_map)
            f.write('\n')


#main method
if __name__ == '__main__':
    
    #instantiate class 'Index'
    index = Index(nltk.word_tokenize, 
              EnglishStemmer(), 
              nltk.corpus.stopwords.words('english'))
    inv_index = index.indexed_docs()   #saves the inverted index into a variable

    #reads arguments from command line
    dir_newsgroups_data = sys.argv[1]    #reads directory of newsgroups data (which is the root directory mini_newsgroup)
    subject_body_index(dir_newsgroups_data)

    feature_defn_file = sys.argv[2]    #argument name to use to write feature definition file
    feature_defn_gen(feature_defn_file)
    print('Produced feature definition file')

    class_defn_file = sys.argv[3]    #argument name to use to write class definition file
    class_defn_gen(class_defn_file,dir_newsgroups_data)
    print('Produced class definition file')
Ejemplo n.º 57
0
global_path = "C:\\Users\\surat_000\\Documents\\Visual Studio 2013\\Projects\\searchDB_CS\\searchDB_CS\\bin\\Debug\\"
global_path = global_path + "text-data-G06F-20000-1\\"
train_path = global_path + "train\\"
test_path = global_path + "test\\"

#stop_words_list = ['it', 'a', 'is', ]

#Trying to use PorterStemmer
import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer

#######
# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = EnglishStemmer()


def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed


import string
import unicodedata
import re


def tokenize(text):
Ejemplo n.º 58
0
def stemmed_words(doc):
    stemmer = EnglishStemmer()
    analyzer = TfidfVectorizer().build_analyzer()
    return ' '.join((stemmer.stem(w) for w in analyzer(doc)))
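
# A hedged usage sketch (scikit-learn and the corpus below are assumptions, not
# part of the original snippet): since stemmed_words returns one stemmed string
# per document, it can serve as the preprocessor of a TfidfVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import EnglishStemmer  # needed inside stemmed_words
corpus = ["Cats are running", "A cat runs", "The dogs ran yesterday"]
tfidf = TfidfVectorizer(preprocessor=stemmed_words)
X = tfidf.fit_transform(corpus)
print(tfidf.get_feature_names_out())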
Ejemplo n.º 59
0
from nltk.tokenize import TweetTokenizer
from nltk.stem.snowball import EnglishStemmer


def tweet_tokenizer(text, stemmer=EnglishStemmer()):
	stemmed_text = [stemmer.stem(word) for word in TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True).tokenize(text)]
	return stemmed_text
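
# A hedged usage sketch with a made-up tweet: handles are stripped, case is
# folded, elongated words are shortened, then every token is stemmed.
print(tweet_tokenizer("@someone loving these amaaazing stemmers!!! #nlp"))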
Ejemplo n.º 60
0
dataloc = '/Volumes/Seagate Backup Plus Drive/PoliTweet/TwitterData/'
import emoji
# Sci-Kit Learn
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score


# ### Required Data

# In[7]:

twtokenizer = TweetTokenizer()


# In[ ]:

stemmer = EnglishStemmer() # Get an instance of SnowballStemmer for English


# In[8]:

punctuation = list(set(string.punctuation)) + ['…','’','...','—',':/','”','..', '“']


# In[9]:

stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',