Example #1
def ngram_in_collection(ngram, coll):
    """
    Check if ngram's components are in collection
    """
    s1 = set([stem(word) for word in ngram.split(' ')])
    s2 = set([stem(word) for word in coll])
    return (len(s1.intersection(s2)) > 0)
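A minimal usage sketch for the helper above, assuming stem is stemming.porter2.stem as in the other examples here:

from stemming.porter2 import stem

# "running" and "runs" share the stem "run", so the stemmed sets overlap
print(ngram_in_collection("running fast", ["runs", "jumped"]))  # True
print(ngram_in_collection("blue sky", ["runs", "jumped"]))      # False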
Example #2
 def tokenize(self):
     punc = """\\.!?,(){}[]"'"""
     wordarray = []
     for c in self.document.lower().split():
         if stem(c.strip()) not in self.corpus.stopwords:
             wordarray.append(stem(c.strip(punc)))
     return wordarray
Example #3
def tokenize_porter(title, body):
    """ Break text into words and stem using the Porter stemmer """
    # break up words & remove stopwords
    title_break = stopWords(nltk.word_tokenize(title), lower_case=True)
    body_break = stopWords(nltk.word_tokenize(body), lower_case=True)
    # print title_break
    return ["title:" + stem(title) for title in title_break] + ["body:" + stem(body) for body in body_break]
Example #4
def makeFreqDictionaryOfSentenceWords(s1):
	words1 = s1.split()
	dt1 = {}
	for w in words1:
		if w.lower() not in stopwords:
			dt1[stem(w.lower())] = dt1.get(stem(w.lower()),0) + 1
	return dt1
Example #5
def find_collocations_tri(filename):
    text_file = open(filename, 'r')

    most_common_words = find_most_common_words(text_file, 100)

    second_word = None
    third_word = None
    fourth_word = None
    collocations = dict()

    text_file.seek(0)
    for line in text_file:
        for word in line.split():
            first_word = second_word
            second_word = third_word
            third_word = fourth_word
            fourth_word = trim_word(word)
            if (first_word not in most_common_words and second_word not in most_common_words and third_word not in most_common_words) and \
                    (first_word and first_word[0].islower() and second_word and second_word[0].islower() and third_word and third_word[0].islower()):
                count_collocations_tri(collocations, stem(first_word.lower()), stem(second_word.lower()), stem(third_word.lower()))

    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    third_word = fourth_word                                   
    count_collocations_tri(collocations, first_word, second_word, third_word)
    sort_collocations_tri(collocations)                      
Example #6
    def get_pmi(self, word0, word1):
        """Return the pointwise mutual information, a measure of word
        association within a window, for two words. This is normalized
        using Bouma (2009) to avoid infinite values for OOV terms.
        """
        word0 = word0.lower()
        word1 = word1.lower()

        if self.stemming:
            word0 = porter2.stem(word0)
            word1 = porter2.stem(word1)

        if word0 not in self.word_counts or word1 not in self.word_counts:
            return -1

        if word0 < word1:
            pair_counts = self.word_pair_counts[word0][word1]
        else:
            # pairs are stored symmetrically with the lexicographically smaller word first
            pair_counts = self.word_pair_counts[word1][word0]

        if pair_counts == 0:
            return -1

        num_words = self.word_counts[anyword]

        # TODO: confirm normalization. Currently assuming words are
        # normalized by num_words and pairs by num_words^2.
        ratio = pair_counts / (self.word_counts[word0] *
                               self.word_counts[word1])
        pmi = np.log(ratio)
        normalized_pmi = - pmi / np.log(pair_counts / (num_words * num_words))

        return normalized_pmi
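For reference, a self-contained sketch of the Bouma (2009) normalization the docstring mentions, written on raw counts; the function name and the p(x, y) estimate (pair count over total words squared) are illustrative assumptions mirroring the TODO note above:

import numpy as np

def normalized_pmi(pair_count, count_x, count_y, total_words):
    # estimated probabilities (assumption: pairs normalized by total_words ** 2)
    p_xy = pair_count / (total_words * total_words)
    p_x = count_x / total_words
    p_y = count_y / total_words
    pmi = np.log(p_xy / (p_x * p_y))
    # Bouma (2009): dividing by -log p(x, y) bounds the score to [-1, 1]
    return pmi / -np.log(p_xy)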
Example #7
 def read(self, publication_keyword, publication_data):
     words = open(publication_keyword, 'r').readlines()
     for i in range(0, self.topic_number):
         s = stem(words[i].split('\t')[0])
         self.topics[ s ] = dict()
         self.stemword_dict[s] = words[i].split('\t')[0]
     content = open(publication_data, 'r').readlines()
     counter = 0
     year = ''
     for i in content:
         # three line represents a publication
         if counter % 3000 == 0:
             print (counter / 3)
         # record the year of this publication
         if counter % 4 == 1:
             year = int(i.strip())
         # parse the keywords of this publication
         elif counter % 4 == 3:
             keywords = i.strip().split(' ')
             for j in keywords:
                 j = stem(j)
                 if j in self.topics:
                     if year in self.topics[j]:
                         self.topics[j][year] += 1
                     else:
                         self.topics[j][year] = 1
         counter = counter + 1
Example #8
def freq(text,index):
	text = text.strip()
	textList = re.split('\W+',text)
	if len(textList) > 1:
		textList = [stem(word) for word in textList]
		setList = list()
		length = len(textList)-1
		for word in textList:

			if word not in index:
				print(0)
				return
			wordSet = { (tuples[0], tuples[1]+length) for tuples in index[word]}
			setList.append(wordSet)
			length-=1
		docNum= setList[0]
		for Docset in setList:
			docNum = docNum & Docset
		print(len(docNum))

	else:
		text = stem(textList[0])
		if text not in index:
			print(0)
			return
		print(len(index[text]))
Example #9
def calculateScore(query,qID):
	

	sfile=open('../AP_DATA/stoplist.txt','r')
	sList=sfile.read().split('\n')
	query=query.lower()
	qList=re.findall("\w+[\.?\w+]*",query)
	temp=list()
	for term in qList:
		if term.endswith('.') and term.count('.') == 1 and len(term) > 1:
			term = term.replace('.', '')
		if term.startswith('_') and term.count('_') == 1 and len(term) > 1:
			term = term.replace('_', '')
		temp.append(term)
	
	qList = temp
	#print index_num
	if index_num=='4':
		#print 123
		qList=[i for i in temp if i not in sList]
		temp=list() 
		for term in qList:
			term=stem(term)
			temp.append(term)
		qList=temp

	if index_num=='3':
		temp=list()
		for term in qList:
			term=stem(term)
			temp.append(term)
		qList=temp

	if index_num=='2':
		qList=[i for i in temp if i not in sList]		
Example #10
def find_collocations(file_name, data, popular_word):
    text_file = open(file_name, 'r')
    file_content = text_file.read()

    most_common_words = find_most_common_words(file_content, popular_word)

    second_word = None
    third_word = None
    collocations = data

    text_file.seek(0)
    for line in text_file:
        for word in line.split():
            first_word = second_word
            second_word = third_word
            third_word = trim_word(word)
            if (first_word not in most_common_words and second_word not in most_common_words) and \
                    (first_word and first_word[0].islower() and second_word and second_word[0].islower()):
                count_collocations(collocations, stem(first_word.lower()), stem(second_word.lower()))

    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    count_collocations(collocations, first_word, second_word)

    collocations = find_whole_collocations_from_stems(collocations, file_content)
    return collocations, most_common_words, file_content
Example #11
    def sentence_matches(self, sentence_text):
        """Returns true iff the sentence contains this mention's upstream
        and downstream participants, and if one of the stemmed verbs in
        the sentence is the same as the stemmed action type."""
        has_upstream = False
        has_downstream = False
        has_verb = False

        # Get the first word of the action type and assume this is the verb
        # (Ex. get depends for depends on)
        actiontype_words = word_tokenize(self.mention.actiontype)
        actiontype_verb_stemmed = stem(actiontype_words[0])

        words = word_tokenize(sentence_text)

        if self.string_matches_sans_whitespace(sentence_text.lower(),
            self.mention.upstream.lower()):
            has_upstream = True

        if self.string_matches_sans_whitespace(sentence_text.lower(),
            self.mention.downstream.lower()):
            has_downstream = True

        for word in words:
            if actiontype_verb_stemmed == stem(word):
                has_verb = True

        return has_upstream and has_downstream and has_verb
Example #12
def find_collocations_penta(text, data, popular_word):
    
    most_common_words = find_most_common_words(text, popular_word)

    second_word = None
    third_word = None
    fourth_word = None
    fifth_word = None
    sixth_word = None
    collocations = data

    for word in text.split():
        first_word = second_word
        second_word = third_word
        third_word = fourth_word
        fourth_word = fifth_word
        fifth_word = sixth_word
        sixth_word = trim_word(word)
        if (first_word not in most_common_words and second_word not in most_common_words and third_word not in most_common_words and fourth_word not in most_common_words and fifth_word not in most_common_words) and \
                (first_word and first_word[0].islower() and second_word and second_word[0].islower() and third_word and third_word[0].islower() and fourth_word and fourth_word[0].islower() and fifth_word and fifth_word[0].islower() ):
            count_collocations_penta(collocations, stem(first_word.lower()), stem(second_word.lower()), stem(third_word.lower()), stem(fourth_word.lower()), stem(fifth_word.lower()))

    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    third_word = fourth_word
    fourth_word = fifth_word
    fifth_word = sixth_word
    count_collocations_penta(collocations, first_word, second_word, third_word, fourth_word, fifth_word)
    return collocations, most_common_words
Example #13
def cleanText(text, entities, category):
	cleanText = text
	hashtags = entities.get('hashtags', [])
	ranges = []
	for hashtag in hashtags:
		if hashtag.get('text', '').lower() == category:
			indices = hashtag.get('indices')
			ranges.append(indices)
	urls = entities.get('urls', [])
	urls.reverse()
	ranges.extend([v for url in urls for k, v in url.items() if k == 'indices'])
	media = entities.get('media', [])
	media.reverse()
	ranges.extend([v for medium in media for k, v in medium.items() if k == 'indices'])
	ranges = sorted(ranges, key=lambda x: x[0], reverse=True)
	for r in ranges:
		cleanText = cleanText[:r[0]] + cleanText[r[1] + 1:]

	category_stem = stem(category).lower()
	cleanTextList = cleanText.split(' ')
	cleanText = []
	for word in cleanTextList:
		if category_stem not in stem(word).lower() and stem(word).lower() not in category_stem:
			cleanText.append(word)
	cleanText = " ".join(cleanText)
	return cleanText
Example #14
def main():
    nlp_file = open(sys.argv[1], "r")
    for line in nlp_file:
        words = line.strip().split(" ")
        for word in words:
            print(stem(word))

    nlp_file.close()
Example #15
def getVocabularyStem(content):
    vocabulary = {}
    index = 0
    for i in range(len(content)):
        if stem(content[i]) not in vocabulary:
            vocabulary[stem(content[i])] = index
            index = index + 1
    return vocabulary
Example #16
def getSentTf(sent, stopwords):
	doc = dict()
	for word in re.split("[^a-zA-Z0-9]", sent):
		word = word.lower()
		if word != "" and word!="'" and stem(word) not in stopwords:
			if doc.get(stem(word), 0) == 0:
				doc[stem(word)] = 1
			else:
				doc[stem(word)] = doc[stem(word)]+1
	return doc
Example #17
 def filter(self):
     # do not generate html file, just filter the correct relationships
     correct_list = list()
     for i in range(0, len(self.linklist)):
         key0 = stem(self.linklist[i][0])
         key1 = stem(self.linklist[i][1])
         if self.judge(key0, key1, i) is False:
             continue
         correct_list.append(i)
     return correct_list
Example #18
def get_word_embedding(word, w2vmodel):
    if word in w2vmodel:
        return w2vmodel[word]
    elif stem(word) in w2vmodel:
        return w2vmodel[stem(word)]
    elif word.lower() in w2vmodel:
        return w2vmodel[word.lower()]
    elif stem(word.lower()) in w2vmodel:
        return w2vmodel[stem(word.lower())]
    else:
        return None
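A small usage sketch; the toy dict below just stands in for a word2vec-style model (anything supporting membership tests and item lookup works):

toy_model = {"run": [0.1, 0.2], "cat": [0.3, 0.4]}
print(get_word_embedding("Running", toy_model))  # [0.1, 0.2], found after lowercasing/stemming
print(get_word_embedding("dog", toy_model))      # None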
Example #19
def form_regex_for_common_words():
    expr = ""
    count = 0
    common_words = fp.read().split()
    for word in common_words:
        count+= 1
        if count == len(common_words):
            expr += "^"+stem(word)+"$"
        else:
            expr += "^"+stem(word)+"$|"
    return expr
Example #20
def naive_wc_sim(str1, str2):
  list1 = nltk.word_tokenize(str1)
  list2 = nltk.word_tokenize(str2)
  count = 0
  for w1 in list1:
    stw1 = stem(w1)
    for w2 in list2:
      stw2 = stem(w2)
      if stw1 == stw2:
        count += 1
  return (1.0*count)/(1.0*min(len(list1), len(list2)))
Example #21
def getDocTf(fileName, stopwords):
	doc = dict()
	with open(fileName, "r") as fi:
		for line in fi:
			for word in re.split("[^a-zA-Z0-9]", line.strip()):
				word = word.lower()
				if word != "" and word!="'" and stem(word) not in stopwords:
					if doc.get(stem(word), 0) == 0:
						doc[stem(word)] = 1
					else:
						doc[stem(word)] = doc[stem(word)]+1
	return doc
Example #22
def overlapMeasure(strA, strB, stopwords):
  # Split and lowercase tokens

  tokA = [x.lower() for x in strA.split(' ') if x != 'what' and x != 'why' and x != 'how']
  tokB = [x.lower() for x in strB.split(' ') if x != 'what' and x != 'why' and x != 'how']
  try:
    from stemming.porter2 import stem
    tokA = [stem(x) for x in tokA]
    tokB = [stem(x) for x in tokB]
  except:
    pass
  overlap = naiveOverlap(tokA, tokB, stopwords)
  return overlap
Example #23
    def process_item(self, item, spider):
        url = item["url"]
        title = item["title"]
        main = item["content"]

        title = re.findall(r'[A-Za-z0-9]\w*', title.lower())
        main = re.findall(r'[A-Za-z0-9]\w*', main.lower())

        for i in range(len(main)):
            main[i] = stem(main[i])
        for i in range(len(title)):
            title[i] = stem(title[i])
        delWord = dict(nltk.pos_tag(main))
        for i in delWord:
            if delWord[i] == 'DT' or delWord[i] == 'IN' or delWord[i] == 'CC' or delWord[i] == 'TO':
                for j in range(main.count(i)):
                    main.remove(i)

        delWord = dict(nltk.pos_tag(title))
        for i in delWord:
            if delWord[i] == 'DT' or delWord[i] == 'IN' or delWord[i] == 'CC' or delWord[i] == 'TO':
                for j in range(title.count(i)):
                    title.remove(i)

        new_main = main + title
        main_pos = {}
        for i in range(len(new_main)):
            if main_pos.get(new_main[i], 0) == 0:
                main_pos[new_main[i]] = [i]
            else:
                main_pos[new_main[i]].append(i)
        main = Counter(main)
        title = Counter(title)

        for i in title:
            title[i] *= 2
        for i in title:
            title[i] = max(title[i], main.get(i, 0)) * 2 + min(title[i], main.get(i, 0))
            main[i] = 0
        main.update(title)

        return {
            "url": url,
            "title": item["title"],
            "content": item["content"],
            "words": main,
            "wordspos": main_pos
        }
Example #24
def visit(word, depth):
    if depth > max_depth:
        return
    if word in visited:
        return

    word_stem = stem(word)

    visited.add(word)

    if not word in freqs:
        freqs[word] = dict()
        stems[word] = dict()

    text = DictionaryServices.DCSCopyTextDefinition(None, word, (0, len(word)))
    if not text or len(text) == 0:
        return

    # We don't care about any of the origin/etymology data, so remove it
    text = text.split('ORIGIN')[0]
    # Remove any punctuation, weird characters, etc.
    filtered_text = re.sub(r'[\W\d]+', ' ', text).lower()
    words = filtered_text.split()

    for w in words:
        w_stem = stem(w)
        if w != word and len(w) >= min_word_len and w_stem != word_stem:
            if not w in freqs:
                freqs[w] = dict()
                stems[w] = dict()

            if w_stem not in stems[word]:
                freqs[word][w] = 1 if w not in freqs[word] else freqs[word][w] + 1
                stems[word][w_stem] = w
            else:
                same_stem = stems[word][w_stem]
                freqs[word][same_stem] = freqs[word][same_stem] + 1

            if word_stem not in stems[w]:
                freqs[w][word] = 1 if word not in freqs[w] else freqs[w][word] + 1
                stems[w][word_stem] = word
            else:
                same_stem = stems[w][word_stem]
                freqs[w][same_stem] = freqs[w][same_stem] + 1

            unigram_freqs[w] = 1 if w not in unigram_freqs else unigram_freqs[w] + 1

            visit(w, depth + 1)
Example #25
def processQuery():
    stopwords = set()
    stopfile = open("stoplist.txt")
    for stopword in stopfile:
        stopwords.add(stopword.rstrip())
    
    queries = loadQueries()
    for queryString in queries.keys():
        
        query = queries[queryString];
        
        # handle dots (".") U.S. becomes US
        query = ''.join(e for e in query if e != '.')
        
        # remove punctuation
        query = re.sub('[^a-zA-Z0-9\n\.]', ' ', query).rstrip()
        
        # remove stop words
        result = query.split(" ")
        mystr = '';
        for term in result:
            if term == '' or term == ' ': continue
            if term not in stopwords:
                # convert to lower case
                term = str(term).lower()
                term = str(stem(term))
                mystr += term + ' '
        queriesdict[queryString] = mystr.rstrip()        
Example #26
def ToVS(text):
    VS=dict()
    text=text.lower()
    
    VS["#!"]=len(re.findall("!",text))
    VS["#?"]=len(re.findall("\\?",text))
    VS["#()"]=len(re.findall("\\(|\\)",text))
    VS["#numbers"]=len(re.findall("\\d+",text))
    VS["##"]=len(re.findall("#",text))
    VS["#{}"]=len(re.findall("\\{|\\}",text))
    VS["#[]"]=len(re.findall("\\[|\\]",text))
    VS["#comparison"]=len(re.findall("<|>|=",text))
    VS['#"']=len(re.findall('"',text))
    VS['#math']=len(re.findall('\\+|\\-|\\*|\\/',text))
    VS['#_']=len(re.findall('_',text))
    VS['#oneCharacter']=0
    for c in string.punctuation:
        if c!="-" and c!="_":
            text=text.replace(c," ")
        else:
            text=text.replace(c,"")
    text=re.sub("[\\s|\\d]+"," ",text)
    text=''.join(filter(lambda x: x in string.printable, text))
    for w in text.split(" "):
        word=stem(w)
        if word!="":
            if len(word)==1:
                VS["#oneCharacter"]+=1
            elif word in VS:
                VS[word]+=1
            else:
                VS[word]=1
    return VS
Example #27
    def __clean__(self, text):
        """
        Clean up a document through stemming and stop word removal.
        Stemming is the act of removing suffixes from a word to limit variation between verb tenses.
        Stop word removal is the act of removing common words that likely contribute nothing to the
        meaning of the document.
        :param text: The document to be cleaned
        :return: The given document after stemming and stop word removal
        """
        text = re.sub("((http:|https:|ftp:|ftps:)//[\w$-_.+!*'(),%=]+)", '', text)
        text = re.sub("(@[\w_]+)", '', text)
        text = re.sub("(#[\w!$-_.+!*'(),%=]+)", '', text)
        text = re.sub("\p{P}+", '', text)
        text = re.sub("[\'\":#,!&]+", '', text)
        
        stopwords = ["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th",
"than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]
        sb = []
        text = text.lower()
        re.sub(r'[\W_]+', '', text)
        for term in text.split():
            term = stem(term)
            if term not in stopwords:
                sb.append(term)

        return ' '.join(sb)
Example #28
def index(urls):
    """
    Goal:  Download a list of webpages
    
    Parameter:
    urls:  list of strings, which represent the address of each webpage 
    
    """    
    
    if not os.path.isdir('files'):
        os.makedirs('files')
    
    
    for webpage in urls:
        name = webpage.split('/')[-1]
        os.system("wget "+webpage+ " -q -O files/"+name)
        logging.info("Downloaded: "+ name )
        
    b_o_w = {}
    
    for web_file in os.listdir('files'):
        
        try:
            text_html = open('files/'+web_file,'r').read();
            text = [stem(word.lower()) for word in html2text(text_html).split()]
            b_o_w[web_file] = text
            logging.info("Tokenized: "+web_file)
        except :
            #Something strange happened with the webpage of New_York_City
            print ("There is a problem with "+web_file)
    
    index_file = open("index_file.pck", "wb")
    pickle.dump(b_o_w, index_file)
    index_file.close()
Example #29
def get_trcmparer_sim(origin_sentences):

    flat_sentences = []
    stopwords = get_stopwords("english_stopwords.txt")
    for sentence in origin_sentences:
        sent_tmp = []
        for word in sentence:
            if word.isalnum():
                word = word.lower()
                if word not in stopwords:
                    stemmed = stem(word)
                    sent_tmp.append(stemmed)
        flat_sentences.append(sent_tmp)
    # print len(flat_sentences)

    trcmp_matrix = np.zeros((len(flat_sentences), len(flat_sentences)),dtype=np.float32)
    for i in range(0,len(flat_sentences)):
        for j in range(i+1,len(flat_sentences)):
            if len(flat_sentences[i]) == 0 or  len(flat_sentences[j]) == 0:
                continue
            intersection_word = intersectionSet(flat_sentences[i],flat_sentences[j])
            trcmp_matrix[i][j] = (len(intersection_word)*1.0)/(np.log(len(flat_sentences[i])+1)+np.log(len(flat_sentences[j])+1))
            trcmp_matrix[j][i] = trcmp_matrix[i][j]
    trcmp_matrix = scale_0_1(trcmp_matrix)
    return trcmp_matrix
Example #30
def get_sent_embedding_w2v(sent, w2vmodel, mode, tfidf_vectorizer = None):
    result_vec = []
    set_words = set(sent)

    for w_idx,word in enumerate(set_words):
        if word.isalnum() == False:
            continue
        word_vector = get_word_embedding(word,w2vmodel)
        if word_vector is None:
            # print word
            continue
        tf_word = sent.count(word)
        if tfidf_vectorizer is not None:
            stemmed = stem(word)
            if word in tfidf_vectorizer.vocabulary_:
                idf_word = tfidf_vectorizer.idf_[tfidf_vectorizer.vocabulary_[word]]
            elif stemmed in tfidf_vectorizer.vocabulary_:
                idf_word = tfidf_vectorizer.idf_[tfidf_vectorizer.vocabulary_[stemmed]]
            else:
                idf_word = 0
            tf_idf_word = tf_word * idf_word
            tf_idf_array = np.array([tf_idf_word])
            word_vector = np.concatenate((word_vector, tf_idf_array))
        result_vec.append(word_vector)
    result_vec = np.array(result_vec)
    if mode == "mean":
        return np.mean(result_vec,axis=0)
    else:
        return np.sum(result_vec,axis=0)
Example #31
def stem_inventory(inventory):
    words = inventory.split(" ")
    stem_words = []
    for word in words:
        if "_" in word or '-' in word or "\\" in word: # if they are compound word, then don't stem
            stem_words.append(word)
        else: # if not stem
            stem_words.append(stem(word))
    return stem_words
Example #32
def removeStems(data):
	"""
	Purpose: 	Computes the stem of each word in the passed word list
	Returns:	A list containing a stem for each word in the words list
	"""
	stemList = []
	for d in data:
		stemList.append(stem(d))
	return stemList
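A quick usage sketch of the function above:

from stemming.porter2 import stem

print(removeStems(["connected", "connection", "connecting"]))
# ['connect', 'connect', 'connect']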
Example #33
def foodwordList(keyword):
    tweets = twitterImport.getTweets(keyword)
    tmpList = []
    for tweet in tweets['statuses']:
        tmpList.append([stem(word) for word in tweet['text'].split(" ")])
    finalList = []
    for list in tmpList:
        finalList = finalList + list
    return finalList
Example #34
def create_features(x):
    phi = defaultdict(lambda: 0)

#    words = x.split()
    for word in x:
        word = stem(word)
        phi["UNI:" + word] += 1

    return phi
Example #35
def search_fast(term: str, vocab: pd.DataFrame) -> list:
    stemmed_term = stem(term.lower())
    options = []
    for row in vocab[vocab["stemmed"].str.contains(stemmed_term)].itertuples(
            index=True):
        options.append((row.URI, row.Label))
        if len(options) >= 151:
            break
    return options
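A usage sketch with a tiny hand-built vocabulary frame; the column names (URI, Label, stemmed) are the ones the function accesses, and the example URIs are made up:

import pandas as pd
from stemming.porter2 import stem

vocab = pd.DataFrame({
    "URI": ["http://example.org/1", "http://example.org/2"],
    "Label": ["running", "cats"],
    "stemmed": [stem("running"), stem("cats")],
})
print(search_fast("Runs", vocab))  # [('http://example.org/1', 'running')]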
Example #36
def addTerm(line, curr_doc, stopWordsList):
    for term in line.split():
        term = term.lower()
        term = stem(term)
        if (term not in stopWordsList and len(term) > 3):
            try:
                curr_doc[term] += 1
            except KeyError:
                curr_doc[term] = 1
Example #37
def clean(doc):
    remove_digits = str.maketrans('', '', digits)
    doc = doc.translate(remove_digits)
    cleantext = BeautifulSoup(doc, "html.parser").text
    doc = re.sub(cleanr, ' ', doc)
    doc = doc.replace("<div>", " ")
    doc = doc.replace("</div>", " ")
    doc = doc.replace(".</div>", " ")
    doc = doc.replace("<br />", " ")
    doc = doc.replace(".", " ")
    doc = doc.replace(":", " ")
    doc = doc.replace(",", " ")
    doc = doc.replace("_", " ")
    doc = doc.replace('-', ' ')
    doc = doc.replace('(', ' ')
    doc = doc.replace(')', ' ')
    doc = doc.replace('#', ' ')
    doc = doc.replace('/', ' ')
    doc = doc.replace(" div ", " ")
    doc = doc.replace(" br ", " ")
    doc = doc.replace("nbsp", " ")
    doc = doc.replace("ndash", " ")
    doc = doc.replace("&rsquo;", ' ')
    doc = doc.replace("&trade;", ' ')
    doc = re.sub(r"\&([^;.]*);", " ", doc)
    doc = re.sub(r"([0-9]+)-([0-9]+)", " ", doc)
    doc = re.sub(r"\d", " ", doc)
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    punc_free = re.sub(r"\b\d+\b", " ", punc_free)
    words = word_tokenize(punc_free)
    lemmatized_words = [lemma.lemmatize(word) for word in words]
    #stemmed_word = [stem(w) for w in lemmatized_words]
    #stemmed_word = [stem(w) for w in words]
    finallist = []
    for ch in lemmatized_words:
        if len(ch) > 2 and len(ch) < 13 and ch.encode('utf-8').isalnum(
        ) == True and bool(re.search(r'\d', ch)) == False:
            try:
                finallist.append(stem(vocab_mapper[ch]))
            except:
                finallist.append(stem(ch))
    final = " ".join(finallist)
    return final
Example #38
def lyrics_to_bow(lyrics):
    """
    Main function to stem and create bag of words.
    It is what we used for the musiXmatch dataset.
    It is heavily oriented towards English lyrics, we apologize for that.
    INPUT
        lyrics as a string
    RETURN
        dictionary word -> count
        or None if something was wrong (e.g. not enough words)
    """
    # remove end of lines
    lyrics_flat = lyrics.replace('\r', '\n').replace('\n', ' ').lower()
    lyrics_flat = ' ' + lyrics_flat + ' '
    # special cases (English...)
    lyrics_flat = lyrics_flat.replace("'m ", " am ")
    lyrics_flat = lyrics_flat.replace("'re ", " are ")
    lyrics_flat = lyrics_flat.replace("'ve ", " have ")
    lyrics_flat = lyrics_flat.replace("'d ", " would ")
    lyrics_flat = lyrics_flat.replace("'ll ", " will ")
    lyrics_flat = lyrics_flat.replace(" he's ", " he is ")
    lyrics_flat = lyrics_flat.replace(" she's ", " she is ")
    lyrics_flat = lyrics_flat.replace(" it's ", " it is ")
    lyrics_flat = lyrics_flat.replace(" ain't ", " is not ")
    lyrics_flat = lyrics_flat.replace("n't ", " not ")
    lyrics_flat = lyrics_flat.replace("'s ", " ")
    # remove boring punctuation and weird signs
    punctuation = (',', "'", '"', ",", ';', ':', '.', '?', '!', '(', ')',
                   '{', '}', '/', '\\', '_', '|', '-', '@', '#', '*')
    for p in punctuation:
        lyrics_flat = lyrics_flat.replace(p, '')
    words = filter(lambda x: x.strip() != '', lyrics_flat.split(' '))
    # stem words
    words = map(lambda x: stem(x), words)
    bow = {}
    for w in words:
        if not w in bow.keys():
            bow[w] = 1
        else:
            bow[w] += 1
    # remove special words that are wrong
    fake_words = ('>', '<', 'outro~')
    bowwords = list(bow.keys())
    for bw in bowwords:
        if bw in fake_words:
            bow.pop(bw)
        elif bw.find(']') >= 0:
            bow.pop(bw)
        elif bw.find('[') >= 0:
            bow.pop(bw)
    # not big enough? remove instrumental ones among others
    if len(bow) <= 3:
        return None
    # done

    return bow
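A short example of calling the function above (output shown approximately):

print(lyrics_to_bow("I'm running, running through the night"))
# {'i': 1, 'am': 1, 'run': 2, 'through': 1, 'the': 1, 'night': 1}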
Example #39
def meaning_bag(pos, url):
    '''
    Return a "meaning bag" given a WordSmyth url. A meaning bag is a bag
    of stemmed words derived from a WordSmyth url of a definition page.

    pos - either "noun" or "verb"
    url - WordSmyth url (e.g. "https://www.wordsmyth.net/?level=3&ent=dog")
    '''

    # download page to file
    filename = wget.download(url, out='{}.html'.format(uuid.uuid4().hex))
    page = open(filename).read()
    soup = BeautifulSoup(page, 'html.parser')
    # delete file !!!
    os.remove(filename)

    bag = set()
    maintable = soup.tbody.find('table', class_="maintable")
    correct_pos = False
    for tr in maintable.tbody.findChildren():

        if tr.get("class") and tr.get("class")[0] == "postitle":
            if tr.find("td", class_="data").a:
                pos_ = tr.find("td", class_="data").a.text
            else:
                pos_ = tr.find("td", class_="data").text

            if pos in pos_.split():
                correct_pos = True
            else:
                correct_pos = False

        elif tr.get("class") and tr.get(
                "class")[0] == "definition" and correct_pos:
            def_ = tr.find("td", attrs={
                'class': 'data'
            }).find_all(text=True, recursive=False)[0]
            # update bag with words from definition
            bag.update([t.lower() for t in tokenizer.tokenize(def_)])
            # check for "similar words"
            if tr.find("td", attrs={'class': 'data'}).find("dl"):
                sim_words = tr.find("td", attrs={'class': 'data'}).dl.dd.a.text
                # update bag with words from "related words" section
                bag.update([t.lower() for t in tokenizer.tokenize(sim_words)])

        elif tr.get("class") and tr.get(
                "class")[0] == "related_word" and correct_pos:
            rel_words = tr.find("td", class_="data").a.text
            # update bag with words from "related words" section
            bag.update([t.lower() for t in tokenizer.tokenize(rel_words)])

    # remove stopwords
    bag -= STOPWORDS
    # stem words
    bag = set([stem(w) for w in bag])
    return (bag)
Example #40
	def document_to_query(self, doc):
		""" Given a document it transforms the source code related fields to a lucene query string """
		query = ""
		for field in ["description"]:
			for val in doc.getFields(field):
				if val.stringValue().strip():
					term = QueryParser.escape(val.stringValue())
					# tokenize
					term = self.tokenize_string(StandardAnalyzer(), term)
					# CamelCase
					temp = []
					for t in term:
						temp += self.camel_case_split(t)
					# stopwords
					temp_2 = []

					for t in temp:
						if t not in english_stop_words:
							temp_2.append(t)
					# stemming
					temp_3 = []
					for t in temp_2:
						temp_3.append(stem(t))
					# stopwords
					temp_4 = []

					for t in temp_3:
						if t not in english_stop_words:
							temp_4.append(t)
					# query generation
					for term in temp_4:
						query += "%s:%s " % (field, term)

		for field in ["typed_method_call", "methods", "used_classes", "class_instance_creation", "methods_called",
					  "annotations", "literals"]:  # "used_classes", , "literals" , "extends"
			for val in doc.getFields(field):
				if val.stringValue().strip():
					term = QueryParser.escape(val.stringValue())
					java_stoplist = ["java.lang.Object", 'void', 'Global', 'boolean', 'String', 'int', 'char', 'float',
									 'double', 'write', 'close', 'from', 'println', 'StringBuilder', 'write',
									 'toString',
									 'close', 'mkdir', 'exists']

					if term not in java_stoplist:
						query += "%s:%s " % (field, term)

		if len(doc.getFields("code_hints")) > 0:
			hints = [hint.stringValue() for hint in doc.getFields("code_hints")]
			hints_str = " ".join(hints)
			for term in hints:
				if term:
					term = QueryParser.escape(term)
					if term not in english_stop_words:
						# print "Including 'code_hints' from Doc_To_Query TERMs... //", term
						query += "code_hints:%s " % term
		return query
Example #41
def processscript(filename):
    print '\n' + filename
    f = open(filename, 'r')

    s = f.read()

    s = s.replace('\\n', ' ')
    s = s.replace('\\t', ' ')
    s = re.sub(r'[^a-zA-Z]', r'\t', s)

    #print 'Tokenizing...'
    x = wordpunct_tokenize(s)
    tokenized = len(x)

    #print 'Removing words of length 1-2...'
    list = []
    for word in x:
        if len(word) > 2:
            list.append(word)
    remove12 = len(list)

    fin = {}

    #print 'Stemming...'
    for wd in list:
        tem = stemport.stem(wd)
        if tem in fin:
            fin[tem] = fin[tem] + 1
        else:
            fin[tem] = 1
    stemmed = len(fin)

    #print 'Removing stop words...'
    f = open('stop.txt', 'r')
    for line in f:
        for word in line.split():
            if (word in fin):
                #print word
                del fin[word]
    #print fin
    stopped = len(fin)

    print 'tokenized:'
    print tokenized

    print 'remove12:'
    print remove12

    print 'stemmed:'
    print stemmed

    print 'stopped:'
    print stopped

    str = ''.join('%s ' % (k) for k, v in fin.iteritems())
    return str
Example #42
 def extract_stem(self, sentence):
     if self._language == 'ko':
         spaced = self._morphs(
             unicodedata.normalize('NFKC',
                                   sentence.strip()).translate(self._table))
     elif self._language == 'en':
         spaced = [
             stem(j) for j in self._morphs(self.normalize_string(sentence))
         ]
     return spaced
Example #43
def tokenise_stem(text):
    '''removes punctuation, lowers all the characters and returns a list of all the stemmed words split at space or newline'''
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    dig = re.compile('\d')
    text_clean1 = punct.sub('', text)
    text_clean = dig.sub(
        '0', text_clean1)  #convert all digits to 0 to optimize indexing
    tokens = text_clean.lower().split()
    stemmed_tokens = [stem(token) for token in tokens]
    return stemmed_tokens
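A quick usage check of the tokeniser above:

print(tokenise_stem("Cats, dogs and badgers!"))
# ['cat', 'dog', 'and', 'badger']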
Example #44
 def fp_steps(self, text):
     title = text.strip().lower()
     title_splchar_removed = self.remove_spl_char_regex.sub(" ", title)
     title_number_removed = self.remove_num.sub("", title_splchar_removed)
     words = title_number_removed.split()
     filter_stop_words = [
         w for w in words if not w in nltk.corpus.stopwords.words('english')
     ]
     stemed = [stem(w) for w in filter_stop_words]
     return sorted(stemed)
Example #45
def get_pages(search_query):
    index = indexer.construct_index('indices/index_1.txt')
    search_query = nltk.word_tokenize(search_query)
    search_query = [stem(word.lower()) for word in search_query]
    print(search_query)
    pages = set(index.get(search_query[0]))
    for i in range(1, len(search_query)):
        word = search_query[i]
        pages = pages.intersection(set(index.get(word)))
    return list(pages)
Example #46
def clean_text(text: str) -> str:
    """Clean text for TFIDF."""
    new_text = re.sub(r'\p{P}+', ' ', text)

    new_text = [stem(i) for i in new_text.lower().split() if not
    re.findall(r'[0-9]', i)]

    new_text = ' '.join(new_text)

    return new_text
Example #47
def normalize(sentence, bad_words=_bad_words):
    res = set()
    if isinstance(sentence, (str, unicode)):
        tokens = token_reg.split(sentence.lower().strip())
        for token in tokens:
            if len(token) > 2 and token not in bad_words:
                stemmed = stem(token)
                if stemmed not in stop_words_en:
                    res.add(token)
    return res
Example #48
def extract_feature(sent):
    with open("./data/stop_words.txt", "r") as f:
        stop_words = [x.strip() for x in f]
    features = []
    for word in sent:
        if word in stop_words:
            continue
        else:
            features.append(stem(word.strip()))
    return features
Example #49
def encodeName(name, wordBag):
    encoding = [0 for word in wordBag]  # 0 for every word in the bag
    # result.append(1/len(text))
    # result.append(1/len(text.split()))
    name = name.lower()
    tokens = word_tokenize(name)
    for t in tokens:
        root = stem(t)
        encoding[wordBag.index(root)] = 1
    return encoding
Example #50
def baseline(line, stop_list):
    word_ls = line.split()
    # remove stop words
    for word in word_ls:
        if check_stop_words(word):
            word_ls = [a for a in word_ls if a != word]
    # stemming
    for i in range(len(word_ls)):
        word_ls[i] = stem(word_ls[i].strip())
    return word_ls
Example #51
    def clean_up(tweet):

        # Apply the Porter stemmer and remove any STOPWORDS

        tweet = ' '.join([word for word in tweet.split(' ')
                         if not word.startswith('#')
                         and not word.startswith('@')
                         and not word.startswith('http')
                         and not word.startswith('www')])

        tweet = to_alphanum(tweet).lower()
        tweet = tweet.split(' ')
        sw = set(STOPWORDS)  # Allows for O(1) lookup

        # return [stem(word) for word in tweet if (word not in sw and stem(word) not in sw)]

        return ['+'.join(i) for i in ngrams([stem(word) for word in
                tweet if word not in sw and stem(word) not in sw
                and len(word) > 2], 1, 2)]
Example #52
def createMapper(valueList):

    mapperDict = {}
    for value in valueList:
        mappedTo = []
        mappedTo.append(value.lower())
        mappedTo.append(stem(value.lower()))
        mapperDict.update(dict.fromkeys(mappedTo,value))

    return mapperDict
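A usage sketch showing the mapping this builds (both the lowercase form and its stem point back to the original value):

mapper = createMapper(["Running", "Cats"])
# {'running': 'Running', 'run': 'Running', 'cats': 'Cats', 'cat': 'Cats'}
print(mapper["run"])  # 'Running'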
Example #53
def mk_word_dic(f):
    word_dic = defaultdict(int)
    for sen in f:
        for w in sen[3:-1].split():
            w = stem(w)
            if stop_func(w):
                continue
            else:
                word_dic[w] += 1
    return word_dic
Example #54
def count_words_web(content):
    word_dicc = dict()
    content = [stem(s.lower()) for s in content.translate(table, string.punctuation).split()
               if len(s)<13 and len(s)>2
               and (s not in common_english_words)]
    e_cont = enumerate(content)
    for idx,word in e_cont:
        if word not in word_dicc: word_dicc[word] = 1
        else: word_dicc[word] += 1
    return word_dicc
Example #55
def price_q_keywords(keywords, pre):
    from stemming.porter2 import stem
    from textblob import TextBlob
    words = ['price', 'cost']
    if type(keywords) == list:
        keywords = copy.copy(keywords[0])
    text = TextBlob(keywords)
    logi = any(
        [word in [stem(w) for w in text.stripped.split()] for word in words])
    return logi
Example #56
def stem_token_list(words):
    """
    Function that uses the Porter stemming algorithm to remove suffixes (and in some cases prefixes)
    in order to find the "root word" or stem of a given word.
    """
    stemmed_tokens = []
    for word in words:
        w = stem(word)
        stemmed_tokens.append(w)
    return stemmed_tokens
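A minimal usage sketch, using the Porter example words:

from stemming.porter2 import stem

print(stem_token_list(["arguing", "argued", "argues"]))
# ['argu', 'argu', 'argu']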
Example #57
 def get_logprob(self, word, alpha=0):
     """Return the log probability of a word in the corpus with optional
     additive smoothing.
     """
     word = word.lower()
     if self.stemming:
         word = porter2.stem(word)
     return np.log(self.word_counts[word] + alpha) - \
             np.log(self.word_counts[anyword] +
                    len(self.word_counts) * alpha)
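For reference, the additive (Laplace) smoothing the method above applies, as a standalone sketch; the word count, total count, and vocabulary size are assumed to come from the surrounding corpus object:

import numpy as np

def additive_logprob(word_count, total_count, vocab_size, alpha=0):
    # log P(w) = log(c(w) + alpha) - log(N + |V| * alpha)
    return np.log(word_count + alpha) - np.log(total_count + vocab_size * alpha)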
Example #58
def clean_text(text):
    if text:
        #whitespaces
        # a = 'this product is       really good'
        clean = ' '.join(text.split())
        red_text = [stem(word) for word in clean.split()]
        return ' '.join(red_text)

    else:
        return text
Example #59
def cleanse_text(text):
    if text:
        # remove whitespace
        clean = ' '.join(text.split())
        # stemming
        red_text = [stem(i.lower()) for i in clean.split()]
        return ' '.join(red_text)

    else:
        return text
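A one-line usage sketch of the function above:

print(cleanse_text("Dogs   are    running"))  # 'dog are run'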
Example #60
def token_lower_nostop_stem_list(all_text, stopword_list):
    token_list = tokenisation_text(all_text)
    token_lowerlist = lower_word(token_list)
    token_lowerlist_nostop = [
        str(current_word) for current_word in token_lowerlist
        if str(current_word) not in stopword_list
    ]
    stem_list = [stem(current_word) for current_word in token_lowerlist_nostop]

    return stem_list