Example #1
def ngram_in_collection(ngram, coll):
    """
    Check if ngram's components are in collection
    """
    s1 = set([stem(word) for word in ngram.split(' ')])
    s2 = set([stem(word) for word in coll])
    return (len(s1.intersection(s2)) > 0)
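A minimal usage sketch for the helper above, assuming stem is stemming.porter2.stem as in the other examples here:

from stemming.porter2 import stem

# "running" and "runs" share the stem "run", so the stemmed sets overlap
print(ngram_in_collection("running fast", ["runs", "jumped"]))  # True
print(ngram_in_collection("blue sky", ["runs", "jumped"]))      # False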
Example #2
 def tokenize(self):
     punc = """\\.!?,(){}[]"'"""
     wordarray = []
     for c in self.document.lower().split():
         if stem(c.strip()) not in self.corpus.stopwords:
             wordarray.append(stem(c.strip(punc)))
     return wordarray
Example #3
def tokenize_porter(title, body):
    """ Break text into words and stem using the Porter stemmer """
    # break up words & remove stopwords
    title_break = stopWords(nltk.word_tokenize(title), lower_case=True)
    body_break = stopWords(nltk.word_tokenize(body), lower_case=True)
    # print title_break
    return ["title:" + stem(title) for title in title_break] + ["body:" + stem(body) for body in body_break]
Example #4
def makeFreqDictionaryOfSentenceWords(s1):
	words1 = s1.split()
	dt1 = {}
	for w in words1:
		if w.lower() not in stopwords:
			dt1[stem(w.lower())] = dt1.get(stem(w.lower()),0) + 1
	return dt1
Example #5
def find_collocations_tri(filename):
    text_file = open(filename, 'r')

    most_common_words = find_most_common_words(text_file, 100)

    second_word = None
    third_word = None
    fourth_word = None
    collocations = dict()

    text_file.seek(0)
    for line in text_file:
        for word in line.split():
            first_word = second_word
            second_word = third_word
            third_word = fourth_word
            fourth_word = trim_word(word)
            if (first_word not in most_common_words and second_word not in most_common_words and third_word not in most_common_words) and \
                    (first_word and first_word[0].islower() and second_word and second_word[0].islower() and third_word and third_word[0].islower()):
                count_collocations_tri(collocations, stem(first_word.lower()), stem(second_word.lower()), stem(third_word.lower()))

    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    third_word = fourth_word                                   
    count_collocations_tri(collocations, first_word, second_word, third_word)
    sort_collocations_tri(collocations)                      
Example #6
    def get_pmi(self, word0, word1):
        """Return the pointwise mutual information, a measure of word
        association within a window, for two words. This is normalized
        using Bouma (2009) to avoid infinite values for OOV terms.
        """
        word0 = word0.lower()
        word1 = word1.lower()

        if self.stemming:
            word0 = porter2.stem(word0)
            word1 = porter2.stem(word1)

        if word0 not in self.word_counts or word1 not in self.word_counts:
            return -1

        if word0 < word1:
            pair_counts = self.word_pair_counts[word0][word1]
        else:
            # pairs are stored symmetrically with the lexicographically smaller word first
            pair_counts = self.word_pair_counts[word1][word0]

        if pair_counts == 0:
            return -1

        num_words = self.word_counts[anyword]

        # TODO: confirm normalization. Currently assuming words are
        # normalized by num_words and pairs by num_words^2.
        ratio = pair_counts / (self.word_counts[word0] *
                               self.word_counts[word1])
        pmi = np.log(ratio)
        normalized_pmi = - pmi / np.log(pair_counts / (num_words * num_words))

        return normalized_pmi
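For reference, a self-contained sketch of the Bouma (2009) normalization the docstring mentions, written on raw counts; the function name and the p(x, y) estimate (pair count over total words squared) are illustrative assumptions mirroring the TODO note above:

import numpy as np

def normalized_pmi(pair_count, count_x, count_y, total_words):
    # estimated probabilities (assumption: pairs normalized by total_words ** 2)
    p_xy = pair_count / (total_words * total_words)
    p_x = count_x / total_words
    p_y = count_y / total_words
    pmi = np.log(p_xy / (p_x * p_y))
    # Bouma (2009): dividing by -log p(x, y) bounds the score to [-1, 1]
    return pmi / -np.log(p_xy)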
Example #7
 def read(self, publication_keyword, publication_data):
     words = open(publication_keyword, 'r').readlines()
     for i in range(0, self.topic_number):
         s = stem(words[i].split('\t')[0])
         self.topics[ s ] = dict()
         self.stemword_dict[s] = words[i].split('\t')[0]
     content = open(publication_data, 'r').readlines()
     counter = 0
     year = ''
     for i in content:
         # three line represents a publication
         if counter % 3000 == 0:
             print (counter / 3)
         # record the year of this publication
         if counter % 4 == 1:
             year = int(i.strip())
         # parse the keywords of this publication
         elif counter % 4 == 3:
             keywords = i.strip().split(' ')
             for j in keywords:
                 j = stem(j)
                 if j in self.topics:
                     if year in self.topics[j]:
                         self.topics[j][year] += 1
                     else:
                         self.topics[j][year] = 1
         counter = counter + 1
Example #8
def freq(text,index):
	text = text.strip()
	textList = re.split('\W+',text)
	if len(textList) > 1:
		textList = [stem(word) for word in textList]
		setList = list()
		length = len(textList)-1
		for word in textList:

			if word not in index:
				print(0)
				return
			wordSet = { (tuples[0], tuples[1]+length) for tuples in index[word]}
			setList.append(wordSet)
			length-=1
		docNum= setList[0]
		for Docset in setList:
			docNum = docNum & Docset
		print(len(docNum))

	else:
		text = stem(textList[0])
		if text not in index:
			print(0)
			return
		print(len(index[text]))
Example #9
def calculateScore(query,qID):
	

	sfile=open('../AP_DATA/stoplist.txt','r')
	sList=sfile.read().split('\n')
	query=query.lower()
	qList=re.findall("\w+[\.?\w+]*",query)
	temp=list()
	for term in qList:
		if term.endswith('.') and term.count('.') == 1 and len(term) > 1:
			term = term.replace('.', '')
		if term.startswith('_') and term.count('_') == 1 and len(term) > 1:
			term = term.replace('_', '')
		temp.append(term)
	
	qList = temp
	#print index_num
	if index_num=='4':
		#print 123
		qList=[i for i in temp if i not in sList]
		temp=list() 
		for term in qList:
			term=stem(term)
			temp.append(term)
		qList=temp

	if index_num=='3':
		temp=list()
		for term in qList:
			term=stem(term)
			temp.append(term)
		qList=temp

	if index_num=='2':
		qList=[i for i in temp if i not in sList]		
Example #10
def find_collocations(file_name, data, popular_word):
    text_file = open(file_name, 'r')
    file_content = text_file.read()

    most_common_words = find_most_common_words(file_content, popular_word)

    second_word = None
    third_word = None
    collocations = data

    text_file.seek(0)
    for line in text_file:
        for word in line.split():
            first_word = second_word
            second_word = third_word
            third_word = trim_word(word)
            if (first_word not in most_common_words and second_word not in most_common_words) and \
                    (first_word and first_word[0].islower() and second_word and second_word[0].islower()):
                count_collocations(collocations, stem(first_word.lower()), stem(second_word.lower()))

    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    count_collocations(collocations, first_word, second_word)

    collocations = find_whole_collocations_from_stems(collocations, file_content)
    return collocations, most_common_words, file_content
Example #11
    def sentence_matches(self, sentence_text):
        """Returns true iff the sentence contains this mention's upstream
        and downstream participants, and if one of the stemmed verbs in
        the sentence is the same as the stemmed action type."""
        has_upstream = False
        has_downstream = False
        has_verb = False

        # Get the first word of the action type and assume this is the verb
        # (Ex. get depends for depends on)
        actiontype_words = word_tokenize(self.mention.actiontype)
        actiontype_verb_stemmed = stem(actiontype_words[0])

        words = word_tokenize(sentence_text)

        if self.string_matches_sans_whitespace(sentence_text.lower(),
            self.mention.upstream.lower()):
            has_upstream = True

        if self.string_matches_sans_whitespace(sentence_text.lower(),
            self.mention.downstream.lower()):
            has_downstream = True

        for word in words:
            if actiontype_verb_stemmed == stem(word):
                has_verb = True

        return has_upstream and has_downstream and has_verb
Example #12
def find_collocations_penta(text, data, popular_word):
    
    most_common_words = find_most_common_words(text, popular_word)

    second_word = None
    third_word = None
    fourth_word = None
    fifth_word = None
    sixth_word = None
    collocations = data

    for word in text.split():
        first_word = second_word
        second_word = third_word
        third_word = fourth_word
        fourth_word = fifth_word
        fifth_word = sixth_word
        sixth_word = trim_word(word)
        if (first_word not in most_common_words and second_word not in most_common_words and third_word not in most_common_words and fourth_word not in most_common_words and fifth_word not in most_common_words) and \
                (first_word and first_word[0].islower() and second_word and second_word[0].islower() and third_word and third_word[0].islower() and fourth_word and fourth_word[0].islower() and fifth_word and fifth_word[0].islower() ):
            count_collocations_penta(collocations, stem(first_word.lower()), stem(second_word.lower()), stem(third_word.lower()), stem(fourth_word.lower()), stem(fifth_word.lower()))

    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    third_word = fourth_word
    fourth_word = fifth_word
    fifth_word = sixth_word
    count_collocations_penta(collocations, first_word, second_word, third_word, fourth_word, fifth_word)
    return collocations, most_common_words
Example #13
def cleanText(text, entities, category):
	cleanText = text
	hashtags = entities.get('hashtags', [])
	ranges = []
	for hashtag in hashtags:
		if hashtag.get('text', '').lower() == category:
			indices = hashtag.get('indices')
			ranges.append(indices)
	urls = entities.get('urls', [])
	urls.reverse()
	ranges.extend([v for url in urls for k, v in url.items() if k == 'indices'])
	media = entities.get('media', [])
	media.reverse()
	ranges.extend([v for medium in media for k, v in medium.items() if k == 'indices'])
	ranges = sorted(ranges, key=lambda x: x[0], reverse=True)
	for r in ranges:
		cleanText = cleanText[:r[0]] + cleanText[r[1] + 1:]

	category_stem = stem(category).lower()
	cleanTextList = cleanText.split(' ')
	cleanText = []
	for word in cleanTextList:
		if category_stem not in stem(word).lower() and stem(word).lower() not in category_stem:
			cleanText.append(word)
	cleanText = " ".join(cleanText)
	return cleanText
Example #14
def main():
    nlp_file = open(sys.argv[1], "r")
    for line in nlp_file:
        words = line.strip().split(" ")
        for word in words:
            print(stem(word))

    nlp_file.close()
Example #15
def getVocabularyStem(content):
    vocabulary = {}
    index = 0
    for i in range(len(content)):
        if stem(content[i]) not in vocabulary:
            vocabulary[stem(content[i])] = index
            index = index + 1
    return vocabulary
Example #16
def getSentTf(sent, stopwords):
	doc = dict()
	for word in re.split("[^a-zA-Z0-9]", sent):
		word = word.lower()
		if word != "" and word!="'" and stem(word) not in stopwords:
			if doc.get(stem(word), 0) == 0:
				doc[stem(word)] = 1
			else:
				doc[stem(word)] = doc[stem(word)]+1
	return doc
Example #17
 def filter(self):
     # do not generate html file, just filter the correct relationships
     correct_list = list()
     for i in range(0, len(self.linklist)):
         key0 = stem(self.linklist[i][0])
         key1 = stem(self.linklist[i][1])
         if self.judge(key0, key1, i) is False:
             continue
         correct_list.append(i)
     return correct_list
Example #18
def get_word_embedding(word, w2vmodel):
    if word in w2vmodel:
        return w2vmodel[word]
    elif stem(word) in w2vmodel:
        return w2vmodel[stem(word)]
    elif word.lower() in w2vmodel:
        return w2vmodel[word.lower()]
    elif stem(word.lower()) in w2vmodel:
        return w2vmodel[stem(word.lower())]
    else:
        return None
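A small usage sketch; the toy dict below just stands in for a word2vec-style model (anything supporting membership tests and item lookup works):

toy_model = {"run": [0.1, 0.2], "cat": [0.3, 0.4]}
print(get_word_embedding("Running", toy_model))  # [0.1, 0.2], found after lowercasing/stemming
print(get_word_embedding("dog", toy_model))      # None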
Example #19
def form_regex_for_common_words():
    expr = ""
    count = 0
    common_words = fp.read().split()
    for word in common_words:
        count+= 1
        if count == len(common_words):
            expr += "^"+stem(word)+"$"
        else:
            expr += "^"+stem(word)+"$|"
    return expr
Example #20
def naive_wc_sim(str1, str2):
  list1 = nltk.word_tokenize(str1)
  list2 = nltk.word_tokenize(str2)
  count = 0
  for w1 in list1:
    stw1 = stem(w1)
    for w2 in list2:
      stw2 = stem(w2)
      if stw1 == stw2:
        count += 1
  return (1.0*count)/(1.0*min(len(list1), len(list2)))
Example #21
def getDocTf(fileName, stopwords):
	doc = dict()
	with open(fileName, "r") as fi:
		for line in fi:
			for word in re.split("[^a-zA-Z0-9]", line.strip()):
				word = word.lower()
				if word != "" and word!="'" and stem(word) not in stopwords:
					if doc.get(stem(word), 0) == 0:
						doc[stem(word)] = 1
					else:
						doc[stem(word)] = doc[stem(word)]+1
	return doc
Example #22
def overlapMeasure(strA, strB, stopwords):
  # Split and lowercase tokens

  tokA = [x.lower() for x in strA.split(' ') if x != 'what' and x != 'why' and x != 'how']
  tokB = [x.lower() for x in strB.split(' ') if x != 'what' and x != 'why' and x != 'how']
  try:
    from stemming.porter2 import stem
    tokA = [stem(x) for x in tokA]
    tokB = [stem(x) for x in tokB]
  except:
    pass
  overlap = naiveOverlap(tokA, tokB, stopwords)
  return overlap
Example #23
    def process_item(self, item, spider):
        url = item["url"]
        title = item["title"]
        main = item["content"]

        title = re.findall(r'[A-Za-z0-9]\w*', title.lower())
        main = re.findall(r'[A-Za-z0-9]\w*', main.lower())

        for i in range(len(main)):
            main[i] = stem(main[i])
        for i in range(len(title)):
            title[i] = stem(title[i])
        delWord = dict(nltk.pos_tag(main))
        for i in delWord:
            if delWord[i] == 'DT' or delWord[i] == 'IN' or delWord[i] == 'CC' or delWord[i] == 'TO':
                for j in range(main.count(i)):
                    main.remove(i)

        delWord = dict(nltk.pos_tag(title))
        for i in delWord:
            if delWord[i] == 'DT' or delWord[i] == 'IN' or delWord[i] == 'CC' or delWord[i] == 'TO':
                for j in range(title.count(i)):
                    title.remove(i)

        new_main = main + title
        main_pos = {}
        for i in range(len(new_main)):
            if main_pos.get(new_main[i], 0) == 0:
                main_pos[new_main[i]] = [i]
            else:
                main_pos[new_main[i]].append(i)
        main = Counter(main)
        title = Counter(title)

        for i in title:
            title[i] *= 2
        for i in title:
            title[i] = max(title[i], main.get(i, 0)) * 2 + min(title[i], main.get(i, 0))
            main[i] = 0
        main.update(title)

        return {
            "url": url,
            "title": item["title"],
            "content": item["content"],
            "words": main,
            "wordspos": main_pos
        }
Example #24
def visit(word, depth):
    if depth > max_depth:
        return
    if word in visited:
        return

    word_stem = stem(word)

    visited.add(word)

    if not word in freqs:
        freqs[word] = dict()
        stems[word] = dict()

    text = DictionaryServices.DCSCopyTextDefinition(None, word, (0, len(word)))
    if not text or len(text) == 0:
        return

    # We don't care about any of the origin/etymology data, so remove it
    text = text.split('ORIGIN')[0]
    # Remove any punctuation, weird characters, etc.
    filtered_text = re.sub(r'[\W\d]+', ' ', text).lower()
    words = filtered_text.split()

    for w in words:
        w_stem = stem(w)
        if w != word and len(w) >= min_word_len and w_stem != word_stem:
            if not w in freqs:
                freqs[w] = dict()
                stems[w] = dict()

            if w_stem not in stems[word]:
                freqs[word][w] = 1 if w not in freqs[word] else freqs[word][w] + 1
                stems[word][w_stem] = w
            else:
                same_stem = stems[word][w_stem]
                freqs[word][same_stem] = freqs[word][same_stem] + 1

            if word_stem not in stems[w]:
                freqs[w][word] = 1 if word not in freqs[w] else freqs[w][word] + 1
                stems[w][word_stem] = word
            else:
                same_stem = stems[w][word_stem]
                freqs[w][same_stem] = freqs[w][same_stem] + 1

            unigram_freqs[w] = 1 if w not in unigram_freqs else unigram_freqs[w] + 1

            visit(w, depth + 1)
Example #25
def processQuery():
    stopwords = set()
    stopfile = open("stoplist.txt")
    for stopword in stopfile:
        stopwords.add(stopword.rstrip())
    
    queries = loadQueries()
    for queryString in queries.keys():
        
        query = queries[queryString];
        
        # handle dots (".") U.S. becomes US
        query = ''.join(e for e in query if e != '.')
        
        # remove punctuation
        query = re.sub('[^a-zA-Z0-9\n\.]', ' ', query).rstrip()
        
        # remove stop words
        result = query.split(" ")
        mystr = '';
        for term in result:
            if term == '' or term == ' ': continue
            if term not in stopwords:
                # convert to lower case
                term = str(term).lower()
                term = str(stem(term))
                mystr += term + ' '
        queriesdict[queryString] = mystr.rstrip()        
Example #26
def ToVS(text):
    VS=dict()
    text=text.lower()
    
    VS["#!"]=len(re.findall("!",text))
    VS["#?"]=len(re.findall("\\?",text))
    VS["#()"]=len(re.findall("\\(|\\)",text))
    VS["#numbers"]=len(re.findall("\\d+",text))
    VS["##"]=len(re.findall("#",text))
    VS["#{}"]=len(re.findall("\\{|\\}",text))
    VS["#[]"]=len(re.findall("\\[|\\]",text))
    VS["#comparison"]=len(re.findall("<|>|=",text))
    VS['#"']=len(re.findall('"',text))
    VS['#math']=len(re.findall('\\+|\\-|\\*|\\/',text))
    VS['#_']=len(re.findall('_',text))
    VS['#oneCharacter']=0
    for c in string.punctuation:
        if c!="-" and c!="_":
            text=text.replace(c," ")
        else:
            text=text.replace(c,"")
    text=re.sub("[\\s|\\d]+"," ",text)
    text=''.join(filter(lambda x: x in string.printable, text))
    for w in text.split(" "):
        word=stem(w)
        if word!="":
            if len(word)==1:
                VS["#oneCharacter"]+=1
            elif word in VS:
                VS[word]+=1
            else:
                VS[word]=1
    return VS
Example #27
    def __clean__(self, text):
        """
        Clean up a document through stemming and stop word removal.
        Stemming is the act of removing suffixes from a word to limit variation between verb tenses.
        Stop word removal is the act of removing common words that likely contribute nothing to the
        meaning of the document.
        :param text: The document to be cleaned
        :return: The given document after stemming and stop word removal
        """
        text = re.sub("((http:|https:|ftp:|ftps:)//[\w$-_.+!*'(),%=]+)", '', text)
        text = re.sub("(@[\w_]+)", '', text)
        text = re.sub("(#[\w!$-_.+!*'(),%=]+)", '', text)
        text = re.sub("\p{P}+", '', text)
        text = re.sub("[\'\":#,!&]+", '', text)
        
        stopwords = ["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th",
"than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]
        sb = []
        text = text.lower()
        re.sub(r'[\W_]+', '', text)
        for term in text.split():
            term = stem(term)
            if term not in stopwords:
                sb.append(term)

        return ' '.join(sb)
Example #28
def index(urls):
    """
    Goal:  Download a list of webpages
    
    Parameter:
    urls:  list of strings, which represent the address of each webpage 
    
    """    
    
    if not os.path.isdir('files'):
        os.makedirs('files')
    
    
    for webpage in urls:
        name = webpage.split('/')[-1]
        os.system("wget "+webpage+ " -q -O files/"+name)
        logging.info("Downloaded: "+ name )
        
    b_o_w = {}
    
    for web_file in os.listdir('files'):
        
        try:
            text_html = open('files/'+web_file,'r').read();
            text = [stem(word.lower()) for word in html2text(text_html).split()]
            b_o_w[web_file] = text
            logging.info("Tokenized: "+web_file)
        except :
            #Something strange happened with the webpage of New_York_City
            print ("There is a problem with "+web_file)
    
    index_file = open("index_file.pck", "wb")
    pickle.dump(b_o_w, index_file)
    index_file.close()
Example #29
def get_trcmparer_sim(origin_sentences):

    flat_sentences = []
    stopwords = get_stopwords("english_stopwords.txt")
    for sentence in origin_sentences:
        sent_tmp = []
        for word in sentence:
            if word.isalnum():
                word = word.lower()
                if word not in stopwords:
                    stemmed = stem(word)
                    sent_tmp.append(stemmed)
        flat_sentences.append(sent_tmp)
    # print len(flat_sentences)

    trcmp_matrix = np.zeros((len(flat_sentences), len(flat_sentences)),dtype=np.float32)
    for i in range(0,len(flat_sentences)):
        for j in range(i+1,len(flat_sentences)):
            if len(flat_sentences[i]) == 0 or  len(flat_sentences[j]) == 0:
                continue
            intersection_word = intersectionSet(flat_sentences[i],flat_sentences[j])
            trcmp_matrix[i][j] = (len(intersection_word)*1.0)/(np.log(len(flat_sentences[i])+1)+np.log(len(flat_sentences[j])+1))
            trcmp_matrix[j][i] = trcmp_matrix[i][j]
    trcmp_matrix = scale_0_1(trcmp_matrix)
    return trcmp_matrix
Example #30
def get_sent_embedding_w2v(sent, w2vmodel, mode, tfidf_vectorizer = None):
    result_vec = []
    set_words = set(sent)

    for w_idx,word in enumerate(set_words):
        if word.isalnum() == False:
            continue
        word_vector = get_word_embedding(word,w2vmodel)
        if word_vector is None:
            # print word
            continue
        tf_word = sent.count(word)
        if tfidf_vectorizer is not None:
            stemmed = stem(word)
            if word in tfidf_vectorizer.vocabulary_:
                idf_word = tfidf_vectorizer.idf_[tfidf_vectorizer.vocabulary_[word]]
            elif stemmed in tfidf_vectorizer.vocabulary_:
                idf_word = tfidf_vectorizer.idf_[tfidf_vectorizer.vocabulary_[stemmed]]
            else:
                idf_word = 0
            tf_idf_word = tf_word * idf_word
            tf_idf_array = np.array([tf_idf_word])
            word_vector = np.concatenate((word_vector, tf_idf_array))
        result_vec.append(word_vector)
    result_vec = np.array(result_vec)
    if mode == "mean":
        return np.mean(result_vec,axis=0)
    else:
        return np.sum(result_vec,axis=0)
Example #31
def stem_inventory(inventory):
    words = inventory.split(" ")
    stem_words = []
    for word in words:
        if "_" in word or '-' in word or "\\" in word: # if they are compound word, then don't stem
            stem_words.append(word)
        else: # if not stem
            stem_words.append(stem(word))
    return stem_words
Example #32
def removeStems(data):
	"""
	Purpose: 	Computes the stem of each word in the passed word list
	Returns:	A list containing a stem for each word in the words list
	"""
	stemList = []
	for d in data:
		stemList.append(stem(d))
	return stemList
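A quick usage sketch of the function above:

from stemming.porter2 import stem

print(removeStems(["connected", "connection", "connecting"]))
# ['connect', 'connect', 'connect']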
Example #33
def foodwordList(keyword):
    tweets = twitterImport.getTweets(keyword)
    tmpList = []
    for tweet in tweets['statuses']:
        tmpList.append([stem(word) for word in tweet['text'].split(" ")])
    finalList = []
    for list in tmpList:
        finalList = finalList + list
    return finalList
Example #34
def create_features(x):
    phi = defaultdict(lambda: 0)

#    words = x.split()
    for word in x:
        word = stem(word)
        phi["UNI:" + word] += 1

    return phi
Example #35
def search_fast(term: str, vocab: pd.DataFrame) -> list:
    stemmed_term = stem(term.lower())
    options = []
    for row in vocab[vocab["stemmed"].str.contains(stemmed_term)].itertuples(
            index=True):
        options.append((row.URI, row.Label))
        if len(options) >= 151:
            break
    return options
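A usage sketch with a tiny hand-built vocabulary frame; the column names (URI, Label, stemmed) are the ones the function accesses, and the example URIs are made up:

import pandas as pd
from stemming.porter2 import stem

vocab = pd.DataFrame({
    "URI": ["http://example.org/1", "http://example.org/2"],
    "Label": ["running", "cats"],
    "stemmed": [stem("running"), stem("cats")],
})
print(search_fast("Runs", vocab))  # [('http://example.org/1', 'running')]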
Example #36
def addTerm(line, curr_doc, stopWordsList):
    for term in line.split():
        term = term.lower()
        term = stem(term)
        if (term not in stopWordsList and len(term) > 3):
            try:
                curr_doc[term] += 1
            except KeyError:
                curr_doc[term] = 1
Example #37
def clean(doc):
    remove_digits = str.maketrans('', '', digits)
    doc = doc.translate(remove_digits)
    cleantext = BeautifulSoup(doc, "html.parser").text
    doc = re.sub(cleanr, ' ', doc)
    doc = doc.replace("<div>", " ")
    doc = doc.replace("</div>", " ")
    doc = doc.replace(".</div>", " ")
    doc = doc.replace("<br />", " ")
    doc = doc.replace(".", " ")
    doc = doc.replace(":", " ")
    doc = doc.replace(",", " ")
    doc = doc.replace("_", " ")
    doc = doc.replace('-', ' ')
    doc = doc.replace('(', ' ')
    doc = doc.replace(')', ' ')
    doc = doc.replace('#', ' ')
    doc = doc.replace('/', ' ')
    doc = doc.replace(" div ", " ")
    doc = doc.replace(" br ", " ")
    doc = doc.replace("nbsp", " ")
    doc = doc.replace("ndash", " ")
    doc = doc.replace("&rsquo;", ' ')
    doc = doc.replace("&trade;", ' ')
    doc = re.sub(r"\&([^;.]*);", " ", doc)
    doc = re.sub(r"([0-9]+)-([0-9]+)", " ", doc)
    doc = re.sub(r"\d", " ", doc)
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    punc_free = re.sub(r"\b\d+\b", " ", punc_free)
    words = word_tokenize(punc_free)
    lemmatized_words = [lemma.lemmatize(word) for word in words]
    #stemmed_word = [stem(w) for w in lemmatized_words]
    #stemmed_word = [stem(w) for w in words]
    finallist = []
    for ch in lemmatized_words:
        if len(ch) > 2 and len(ch) < 13 and ch.encode('utf-8').isalnum(
        ) == True and bool(re.search(r'\d', ch)) == False:
            try:
                finallist.append(stem(vocab_mapper[ch]))
            except:
                finallist.append(stem(ch))
    final = " ".join(finallist)
    return final
Example #38
def lyrics_to_bow(lyrics):
    """
    Main function to stem and create bag of words.
    It is what we used for the musiXmatch dataset.
    It is heavily oriented towards English lyrics, we apologize for that.
    INPUT
        lyrics as a string
    RETURN
        dictionary word -> count
        or None if something was wrong (e.g. not enough words)
    """
    # remove end of lines
    lyrics_flat = lyrics.replace('\r', '\n').replace('\n', ' ').lower()
    lyrics_flat = ' ' + lyrics_flat + ' '
    # special cases (English...)
    lyrics_flat = lyrics_flat.replace("'m ", " am ")
    lyrics_flat = lyrics_flat.replace("'re ", " are ")
    lyrics_flat = lyrics_flat.replace("'ve ", " have ")
    lyrics_flat = lyrics_flat.replace("'d ", " would ")
    lyrics_flat = lyrics_flat.replace("'ll ", " will ")
    lyrics_flat = lyrics_flat.replace(" he's ", " he is ")
    lyrics_flat = lyrics_flat.replace(" she's ", " she is ")
    lyrics_flat = lyrics_flat.replace(" it's ", " it is ")
    lyrics_flat = lyrics_flat.replace(" ain't ", " is not ")
    lyrics_flat = lyrics_flat.replace("n't ", " not ")
    lyrics_flat = lyrics_flat.replace("'s ", " ")
    # remove boring punctuation and weird signs
    punctuation = (',', "'", '"', ",", ';', ':', '.', '?', '!', '(', ')',
                   '{', '}', '/', '\\', '_', '|', '-', '@', '#', '*')
    for p in punctuation:
        lyrics_flat = lyrics_flat.replace(p, '')
    words = filter(lambda x: x.strip() != '', lyrics_flat.split(' '))
    # stem words
    words = map(lambda x: stem(x), words)
    bow = {}
    for w in words:
        if not w in bow.keys():
            bow[w] = 1
        else:
            bow[w] += 1
    # remove special words that are wrong
    fake_words = ('>', '<', 'outro~')
    bowwords = list(bow.keys())
    for bw in bowwords:
        if bw in fake_words:
            bow.pop(bw)
        elif bw.find(']') >= 0:
            bow.pop(bw)
        elif bw.find('[') >= 0:
            bow.pop(bw)
    # not big enough? remove instrumental ones among others
    if len(bow) <= 3:
        return None
    # done

    return bow
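A short example of calling the function above (output shown approximately):

print(lyrics_to_bow("I'm running, running through the night"))
# {'i': 1, 'am': 1, 'run': 2, 'through': 1, 'the': 1, 'night': 1}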
Example #39
def meaning_bag(pos, url):
    '''
    Return a "meaning bag" given a WordSmyth url. A meaning bag is a bag
    of stemmed words derived from a WordSmyth url of a definition page.

    pos - either "noun" or "verb"
    url - WordSmyth url (e.g. "https://www.wordsmyth.net/?level=3&ent=dog")
    '''

    # download page to file
    filename = wget.download(url, out='{}.html'.format(uuid.uuid4().hex))
    page = open(filename).read()
    soup = BeautifulSoup(page, 'html.parser')
    # delete file !!!
    os.remove(filename)

    bag = set()
    maintable = soup.tbody.find('table', class_="maintable")
    correct_pos = False
    for tr in maintable.tbody.findChildren():

        if tr.get("class") and tr.get("class")[0] == "postitle":
            if tr.find("td", class_="data").a:
                pos_ = tr.find("td", class_="data").a.text
            else:
                pos_ = tr.find("td", class_="data").text

            if pos in pos_.split():
                correct_pos = True
            else:
                correct_pos = False

        elif tr.get("class") and tr.get(
                "class")[0] == "definition" and correct_pos:
            def_ = tr.find("td", attrs={
                'class': 'data'
            }).find_all(text=True, recursive=False)[0]
            # update bag with words from definition
            bag.update([t.lower() for t in tokenizer.tokenize(def_)])
            # check for "similar words"
            if tr.find("td", attrs={'class': 'data'}).find("dl"):
                sim_words = tr.find("td", attrs={'class': 'data'}).dl.dd.a.text
                # update bag with words from "related words" section
                bag.update([t.lower() for t in tokenizer.tokenize(sim_words)])

        elif tr.get("class") and tr.get(
                "class")[0] == "related_word" and correct_pos:
            rel_words = tr.find("td", class_="data").a.text
            # update bag with words from "related words" section
            bag.update([t.lower() for t in tokenizer.tokenize(rel_words)])

    # remove stopwords
    bag -= STOPWORDS
    # stem words
    bag = set([stem(w) for w in bag])
    return (bag)
Example #40
	def document_to_query(self, doc):
		""" Given a document it transforms the source code related fields to a lucene query string """
		query = ""
		for field in ["description"]:
			for val in doc.getFields(field):
				if val.stringValue().strip():
					term = QueryParser.escape(val.stringValue())
					# tokenize
					term = self.tokenize_string(StandardAnalyzer(), term)
					# CamelCase
					temp = []
					for t in term:
						temp += self.camel_case_split(t)
					# stopwords
					temp_2 = []

					for t in temp:
						if t not in english_stop_words:
							temp_2.append(t)
					# stemming
					temp_3 = []
					for t in temp_2:
						temp_3.append(stem(t))
					# stopwords
					temp_4 = []

					for t in temp_3:
						if t not in english_stop_words:
							temp_4.append(t)
					# query generation
					for term in temp_4:
						query += "%s:%s " % (field, term)

		for field in ["typed_method_call", "methods", "used_classes", "class_instance_creation", "methods_called",
					  "annotations", "literals"]:  # "used_classes", , "literals" , "extends"
			for val in doc.getFields(field):
				if val.stringValue().strip():
					term = QueryParser.escape(val.stringValue())
					java_stoplist = ["java.lang.Object", 'void', 'Global', 'boolean', 'String', 'int', 'char', 'float',
									 'double', 'write', 'close', 'from', 'println', 'StringBuilder', 'write',
									 'toString',
									 'close', 'mkdir', 'exists']

					if term not in java_stoplist:
						query += "%s:%s " % (field, term)

		if len(doc.getFields("code_hints")) > 0:
			hints = [hint.stringValue() for hint in doc.getFields("code_hints")]
			hints_str = " ".join(hints)
			for term in hints:
				if term:
					term = QueryParser.escape(term)
					if term not in english_stop_words:
						# print "Including 'code_hints' from Doc_To_Query TERMs... //", term
						query += "code_hints:%s " % term
		return query
Example #41
def processscript(filename):
    print '\n' + filename
    f = open(filename, 'r')

    s = f.read()

    s = s.replace('\\n', ' ')
    s = s.replace('\\t', ' ')
    s = re.sub(r'[^a-zA-Z]', r'\t', s)

    #print 'Tokenizing...'
    x = wordpunct_tokenize(s)
    tokenized = len(x)

    #print 'Removing words of length 1-2...'
    list = []
    for word in x:
        if len(word) > 2:
            list.append(word)
    remove12 = len(list)

    fin = {}

    #print 'Stemming...'
    for wd in list:
        tem = stemport.stem(wd)
        if tem in fin:
            fin[tem] = fin[tem] + 1
        else:
            fin[tem] = 1
    stemmed = len(fin)

    #print 'Removing stop words...'
    f = open('stop.txt', 'r')
    for line in f:
        for word in line.split():
            if (word in fin):
                #print word
                del fin[word]
    #print fin
    stopped = len(fin)

    print 'tokenized:'
    print tokenized

    print 'remove12:'
    print remove12

    print 'stemmed:'
    print stemmed

    print 'stopped:'
    print stopped

    str = ''.join('%s ' % (k) for k, v in fin.iteritems())
    return str
Example #42
 def extract_stem(self, sentence):
     if self._language == 'ko':
         spaced = self._morphs(
             unicodedata.normalize('NFKC',
                                   sentence.strip()).translate(self._table))
     elif self._language == 'en':
         spaced = [
             stem(j) for j in self._morphs(self.normalize_string(sentence))
         ]
     return spaced
Example #43
def tokenise_stem(text):
    '''removes punctuation, lowers all the characters and returns a list of all the stemmed words split at space or newline'''
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    dig = re.compile('\d')
    text_clean1 = punct.sub('', text)
    text_clean = dig.sub(
        '0', text_clean1)  #convert all digits to 0 to optimize indexing
    tokens = text_clean.lower().split()
    stemmed_tokens = [stem(token) for token in tokens]
    return stemmed_tokens
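A quick usage check of the tokeniser above:

print(tokenise_stem("Cats, dogs and badgers!"))
# ['cat', 'dog', 'and', 'badger']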
Example #44
 def fp_steps(self, text):
     title = text.strip().lower()
     title_splchar_removed = self.remove_spl_char_regex.sub(" ", title)
     title_number_removed = self.remove_num.sub("", title_splchar_removed)
     words = title_number_removed.split()
     filter_stop_words = [
         w for w in words if not w in nltk.corpus.stopwords.words('english')
     ]
     stemed = [stem(w) for w in filter_stop_words]
     return sorted(stemed)
Example #45
def get_pages(search_query):
    index = indexer.construct_index('indices/index_1.txt')
    search_query = nltk.word_tokenize(search_query)
    search_query = [stem(word.lower()) for word in search_query]
    print(search_query)
    pages = set(index.get(search_query[0]))
    for i in range(1, len(search_query)):
        word = search_query[i]
        pages = pages.intersection(set(index.get(word)))
    return list(pages)
Example #46
def clean_text(text: str) -> str:
    """Clean text for TFIDF."""
    new_text = re.sub(r'\p{P}+', ' ', text)

    new_text = [stem(i) for i in new_text.lower().split() if not
    re.findall(r'[0-9]', i)]

    new_text = ' '.join(new_text)

    return new_text
Example #47
def normalize(sentence, bad_words=_bad_words):
    res = set()
    if isinstance(sentence, (str, unicode)):
        tokens = token_reg.split(sentence.lower().strip())
        for token in tokens:
            if len(token) > 2 and token not in bad_words:
                stemmed = stem(token)
                if stemmed not in stop_words_en:
                    res.add(token)
    return res
Example #48
def extract_feature(sent):
    with open("./data/stop_words.txt", "r") as f:
        stop_words = [x.strip() for x in f]
    features = []
    for word in sent:
        if word in stop_words:
            continue
        else:
            features.append(stem(word.strip()))
    return features
Example #49
def encodeName(name, wordBag):
    encoding = [0 for word in wordBag]  # 0 for every word in the bag
    # result.append(1/len(text))
    # result.append(1/len(text.split()))
    name = name.lower()
    tokens = word_tokenize(name)
    for t in tokens:
        root = stem(t)
        encoding[wordBag.index(root)] = 1
    return encoding
Example #50
def baseline(line, stop_list):
    word_ls = line.split()
    # remove stop words
    for word in word_ls:
        if check_stop_words(word):
            word_ls = [a for a in word_ls if a != word]
    # stemming
    for i in range(len(word_ls)):
        word_ls[i] = stem(word_ls[i].strip())
    return word_ls
Example #51
    def clean_up(tweet):

        # Apply the Porter stemmer and remove any STOPWORDS

        tweet = ' '.join([word for word in tweet.split(' ')
                         if not word.startswith('#')
                         and not word.startswith('@')
                         and not word.startswith('http')
                         and not word.startswith('www')])

        tweet = to_alphanum(tweet).lower()
        tweet = tweet.split(' ')
        sw = set(STOPWORDS)  # Allows for O(1) lookup

        # return [stem(word) for word in tweet if (word not in sw and stem(word) not in sw)]

        return ['+'.join(i) for i in ngrams([stem(word) for word in
                tweet if word not in sw and stem(word) not in sw
                and len(word) > 2], 1, 2)]
Example #52
def createMapper(valueList):

    mapperDict = {}
    for value in valueList:
        mappedTo = []
        mappedTo.append(value.lower())
        mappedTo.append(stem(value.lower()))
        mapperDict.update(dict.fromkeys(mappedTo,value))

    return mapperDict
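A usage sketch showing the mapping this builds (both the lowercase form and its stem point back to the original value):

mapper = createMapper(["Running", "Cats"])
# {'running': 'Running', 'run': 'Running', 'cats': 'Cats', 'cat': 'Cats'}
print(mapper["run"])  # 'Running'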
Example #53
def mk_word_dic(f):
    word_dic = defaultdict(int)
    for sen in f:
        for w in sen[3:-1].split():
            w = stem(w)
            if stop_func(w):
                continue
            else:
                word_dic[w] += 1
    return word_dic
Example #54
def count_words_web(content):
    word_dicc = dict()
    content = [stem(s.lower()) for s in content.translate(table, string.punctuation).split()
               if len(s)<13 and len(s)>2
               and (s not in common_english_words)]
    e_cont = enumerate(content)
    for idx,word in e_cont:
        if word not in word_dicc: word_dicc[word] = 1
        else: word_dicc[word] += 1
    return word_dicc
Example #55
def price_q_keywords(keywords, pre):
    from stemming.porter2 import stem
    from textblob import TextBlob
    words = ['price', 'cost']
    if type(keywords) == list:
        keywords = copy.copy(keywords[0])
    text = TextBlob(keywords)
    logi = any(
        [word in [stem(w) for w in text.stripped.split()] for word in words])
    return logi
Example #56
def stem_token_list(words):
    """
    Function that uses the Porter stemming algorithm to remove suffixes (and in some cases prefixes)
    in order to find the "root word" or stem of a given word.
    """
    stemmed_tokens = []
    for word in words:
        w = stem(word)
        stemmed_tokens.append(w)
    return stemmed_tokens
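A minimal usage sketch, using the Porter example words:

from stemming.porter2 import stem

print(stem_token_list(["arguing", "argued", "argues"]))
# ['argu', 'argu', 'argu']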
Example #57
 def get_logprob(self, word, alpha=0):
     """Return the log probability of a word in the corpus with optional
     additive smoothing.
     """
     word = word.lower()
     if self.stemming:
         word = porter2.stem(word)
     return np.log(self.word_counts[word] + alpha) - \
             np.log(self.word_counts[anyword] +
                    len(self.word_counts) * alpha)
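For reference, the additive (Laplace) smoothing the method above applies, as a standalone sketch; the word count, total count, and vocabulary size are assumed to come from the surrounding corpus object:

import numpy as np

def additive_logprob(word_count, total_count, vocab_size, alpha=0):
    # log P(w) = log(c(w) + alpha) - log(N + |V| * alpha)
    return np.log(word_count + alpha) - np.log(total_count + vocab_size * alpha)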
Example #58
def clean_text(text):
    if text:
        #whitespaces
        # a = 'this product is       really good'
        clean = ' '.join(text.split())
        red_text = [stem(word) for word in clean.split()]
        return ' '.join(red_text)

    else:
        return text
Example #59
def cleanse_text(text):
    if text:
        # remove whitespace
        clean = ' '.join(text.split())
        # stemming
        red_text = [stem(i.lower()) for i in clean.split()]
        return ' '.join(red_text)

    else:
        return text
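A one-line usage sketch of the function above:

print(cleanse_text("Dogs   are    running"))  # 'dog are run'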
Example #60
def token_lower_nostop_stem_list(all_text, stopword_list):
    token_list = tokenisation_text(all_text)
    token_lowerlist = lower_word(token_list)
    token_lowerlist_nostop = [
        str(current_word) for current_word in token_lowerlist
        if str(current_word) not in stopword_list
    ]
    stem_list = [stem(current_word) for current_word in token_lowerlist_nostop]

    return stem_list