Example #1
def getFormattingFeatures(obj):
  question = obj["question_text"].strip()
  topics = [t["name"] for t in obj["topics"]]
  tokens = [w for w in wordpunct_tokenize(question) if not re.match(r"[\'\"\.\?\!\,\/\\\(\)\`]", w)]
  punct = [p for p in wordpunct_tokenize(question) if re.match(r"[\'\"\.\?\!\,\/\\\(\)\`]", p)]
  top_toks = set([w.lower() for t in obj["topics"] for w in wordpunct_tokenize(t["name"])])
  qn_toks  = set(tokens)
  qn_topic_words = len(top_toks & qn_toks)
  start_cap = 1 if re.match(r"^[A-Z]", question) else 0
  if len(tokens) > 0:
    qn_type = [1 if sum(1.0 for w in tokens if w in qws) else 0 for qws in qn_type_words]
  else:
    # penalize having no token words
    qn_type = [-1.0] * len(qn_type_words)
  total_words = len(tokens)
  correct_form_count = sum(1.0 for w in tokens if (not re.match(r"^[A-Z]+$", w)) or re.match(r"^[A-Z]", w))
  topic_word_ratio1  = max(0, qn_topic_words - 2) / float(total_words + 1)
  topic_word_ratio2  = max(0, 2 - qn_topic_words) / float(total_words + 1)
  topic_word_ratio   = qn_topic_words / float(total_words + 1)
  punctuation_ratio  = len(punct) / float(total_words + 1)
  word_overshoot = max(0, total_words - 10.1)
  word_undershoot = max(0, 10.1 - total_words)
  result = [
    start_cap,
    punctuation_ratio,
    math.log(len(topics) + 1),
    topic_word_ratio1,
    topic_word_ratio2,
    topic_word_ratio,
    word_overshoot,
    word_undershoot,
   ] + qn_type
  return result
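A minimal usage sketch for getFormattingFeatures. The module-level names it relies on (re, math, wordpunct_tokenize and qn_type_words) are not shown in this example, so the definitions below are only plausible stand-ins.

import math
import re
from nltk.tokenize import wordpunct_tokenize

# Hypothetical question-word groups; the real qn_type_words is not shown above.
qn_type_words = [{"what"}, {"why"}, {"how"}, {"who", "whom"}, {"when", "where"}]

obj = {
    "question_text": "What is the best way to learn Python?",
    "topics": [{"name": "Python"}, {"name": "Programming"}],
}
print(getFormattingFeatures(obj))  # numeric feature vector, ending with the qn_type flags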
Example #2
def getResult(textFile, ind1, ind2, outFile, outFile2):
	fout = open(outFile,"w")
	fout2 = open(outFile2, "w")
	#probs = []
	for line in open(textFile):
		hyp1 = wordpunct_tokenize(line.strip().split("|||")[ind1].strip().decode("utf-8"))
		hyp2 = wordpunct_tokenize(line.strip().split("|||")[ind2].strip().decode("utf-8"))

		f = open("temp.txt","w")
		f.write("%s\n"%" ".join([x.encode("utf-8") for x in hyp1]))
		f.close()
		os.system("~/Course/AMMML/project/FeatureAugmentedRNNToolkit/rnnlm -rnnlm ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/model -test temp.txt -features-matrix ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/feature.txt -independent > temp_out.txt")
		
		prob1 = getProb("temp_out.txt")
	
		f = open("temp.txt","w")
		f.write("%s\n"%" ".join([x.encode("utf-8") for x in hyp2]))
		f.close()
		os.system("~/Course/AMMML/project/FeatureAugmentedRNNToolkit/rnnlm -rnnlm ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/model -test temp.txt -features-matrix ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/feature.txt -independent > temp_out.txt")
			
		prob2 = getProb("temp_out.txt")

		#probs.append([prob1,prob2])
		fout.write("%f\t%f\n"%(prob1,prob2))
		fout2.write("%f\t%f\n"%(prob1/float(len(hyp1)),prob2/float(len(hyp2))))
	fout.close()
	fout2.close()
Example #3
def formatting_features(obj):
	question = obj['question_text'].strip()
	topics   = [ t['name'] for t in obj['topics'] ]
	tokens   = [ w for w in wordpunct_tokenize(question) if not re.match(r'[\'\"\.\?\!\,\/\\\(\)\`]',w) ]
	punct    = [ p for p in wordpunct_tokenize(question) if re.match(r'[\'\"\.\?\!\,\/\\\(\)\`]',p) ]
	top_toks = set([ w.lower() for t in obj['topics'] 
						for w in wordpunct_tokenize(t['name']) ])
	qn_toks  = set(tokens)
	#qn_topic_words = len(top_toks & qn_toks)

	qn_mark   = 1 if "?" in question else -1 
	start_cap = 1 if re.match(r'^[A-Z]',question) else -1
	if tokens:
		qn_type = [ sum(1.0 for w in tokens if w in qws)
						for qws in qn_type_words ]
		nm_pres = sum(1.0 for w in tokens if w.lower() in names
							and re.match(r'^[A-Z]',w))
		pl_pres = sum(1.0 for w in tokens if w.lower() in places
							and re.match(r'^[A-Z]',w))
	else:
		qn_type = [0.0]*len(qn_type_words)
		nm_pres = -1.0
		pl_pres = -1.0

#	qn_somewhere =  1 if sum(qn_type) and (re.match(r'\?$',question)
#						or re.match(r'\?\s*[A-Z]',question)) else -1

	total_words = len(tokens)
	dict_words  = sum(1 for w in tokens if w.lower() in eng_words)
	correct_form_count = sum(1.0 for w in tokens
			if (w.lower() in eng_words and not re.match(r'^[A-Z]+$',w))
			or re.match(r'^[A-Z]',w)
		)
	question_form = 1 if '?' in punct and sum(1 for w in tokens if w in qn_words) else -1
	correct_form_ratio = correct_form_count/float(total_words+1)
	#topic_word_ratio   = qn_topic_words/float(total_words+1)
	name_ratio         = (nm_pres + pl_pres)/float(total_words+1)
	punctuation_ratio  = len(punct)/float(total_words+1)
	result = [
			#	1 if nm_pres else 0,
				nm_pres,
			#	1 if pl_pres else 0,
				pl_pres,
				qn_mark,
				start_cap,
			#	qn_somewhere,
				correct_form_ratio,
				#len(punct),
				punctuation_ratio,
		   		math.log(len(topics)+1),
		   		#len(topics),
				name_ratio,
			#	topic_word_ratio,
				dict_words,
			#	qn_topic_words,
			#	correct_form_count,
			#	math.log(total_words+1),
				total_words,
			] + qn_type
	return result
Example #4
 def text_to_sentences(self, text, tokenizer, remove_stopwords=False ):
     print "text_to_sentence"
     #from nltk.tokenize import wordpunct_tokenize
     # Function to split a review into parsed sentences. Returns a 
     # list of sentences, where each sentence is a list of words
     #
     text=text.decode("utf8")
     from nltk.tokenize import sent_tokenize,wordpunct_tokenize
     # 1. Use the NLTK tokenizer to split the paragraph into sentences
     #raw_sentences = tokenizer.tokenize(text.strip())
     raw_sentences = sent_tokenize(text.strip())
     print "finish tokenize sentence",len(raw_sentences)
     #
     # 2. Loop over each sentence
     sentences = []
     for raw_sentence in raw_sentences:
         
         #print "sentence:",raw_sentence
         # If a sentence is empty, skip it
         if len(raw_sentence) > 0:
             # Otherwise, call review_to_wordlist to get a list of words
             #sentences.append( text_to_wordlist( raw_sentence, \
 #               remove_stopwords ))
             #print removePunctuation(raw_sentence).lower().split()
             print raw_sentence
             sentences.append(wordpunct_tokenize(raw_sentence))#raw_sentence.split())
             print wordpunct_tokenize(raw_sentence)
             #print  text_to_wordlist( raw_sentence, remove_stopwords )
     #    
     # Return the list of sentences (each sentence is a list of words,
     # so this returns a list of lists).
     return sentences
def check_len_stats(std_dev):
    fraction = 0
    for i in range(1,5):
        fraction+=0.25
        count1 = 0
        count2 = 0
        mcount = 0
        ncount = 0
        threshold = fraction*std_dev
        print threshold
        with open(infile, 'r') as f:
            for line in f:
                mem_len = 0
                nonmem_len= 0
                if(line.strip().split('\t')[1]=='M'):
                    mem_len+=len(wordpunct_tokenize(line.strip().split('\t')[0]))
                    mcount +=1
                    if (float(mem_len) < threshold):
                        count1+=1
                else:
                    nonmem_len+=len(wordpunct_tokenize(line.strip().split('\t')[0]))
                    ncount+=1
                    if (float(nonmem_len) < threshold):
                        count2+=1
        f.close()
        print "iteration-" , i
        print "memorable quotes below threshold-", count1
        print "total memorable quotes-",mcount
        print "non-memorable quotes below threshold-",count2
        print "non memorable quotes-",ncount
def dice_sentence(sentence1, sentence2):
    """
    Determines the Dice value of two sentences

    :param sentence1:
    :param sentence2:
    :return: dice value
    """
    return dice(wordpunct_tokenize(sentence1), wordpunct_tokenize(sentence2))
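The dice helper is not defined in this snippet; a minimal sketch of a Dice coefficient over token lists (2·|A∩B| / (|A|+|B|)), which is presumably what it computes, could look like this.

def dice(tokens1, tokens2):
    # Dice coefficient of two token lists, compared as sets
    s1, s2 = set(tokens1), set(tokens2)
    if not s1 and not s2:
        return 0.0
    return 2.0 * len(s1 & s2) / (len(s1) + len(s2))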
Example #7
def common_words(sent1,sent2):
    # remove stop words, lemmatise and return count of common words
    porter = PorterStemmer()
    #stop = stopwords.words('english')
    s1_words =  [porter.stem(i.lower()) for i in wordpunct_tokenize(sent1)  ]
    s2_words =  [porter.stem(i.lower()) for i in wordpunct_tokenize(sent2)  ]
    s1 = set(s1_words)
    s2 = set(s2_words)
    return len(s1.intersection(s2)) / ((len(s1)+0.1+len(s2))/2.0) # normalised 
def jaccard_sentence(sentence1, sentence2):
    """
    Determines jaccard value of two sentences

    :param sentence1:
    :param sentence2:
    :return: jaccard value
    """
    return jaccard(wordpunct_tokenize(sentence1), wordpunct_tokenize(sentence2))
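Likewise, jaccard is not shown; a plausible set-based sketch (|A∩B| / |A∪B|) over the two token lists:

def jaccard(tokens1, tokens2):
    # Jaccard similarity of two token lists, compared as sets
    s1, s2 = set(tokens1), set(tokens2)
    if not s1 and not s2:
        return 0.0
    return len(s1 & s2) / float(len(s1 | s2))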
Example #9
    def load_memes (self, filenames):

        for filename in filenames:
            f = open(filename, 'r')
            contents = f.readlines()
            for entry in contents:
                fields = [s.strip() for s in entry.split("|")]
                meme_type = fields[0]
                top_text = wordpunct_tokenize(fields[1].lower())
                bottom_text = wordpunct_tokenize(fields[2].lower())
                self.memes[meme_type].append ((top_text, bottom_text))
    def generate_vocabulary(self, review_summary_file):
        self.rev_sum_pair = pd.read_csv(review_summary_file,header=0).values

        for review,summary in self.rev_sum_pair:
            rev_lst = wordpunct_tokenize(review)
            sum_lst = wordpunct_tokenize(summary)
            self.__add_list_to_dict(rev_lst)
            self.__add_list_to_dict(sum_lst)

        # Now store the "" empty string as the last word of the voacabulary
        self.map[""] = len(self.map)
        self.revmap[len(self.map)] = ""
def med_sentence(sentence1, sentence2, c1=1, c2=1, c3=1):
    """
    Determines minimum edit distance of two sentences.

    :param sentence1: first sentence
    :param sentence2: second sentence
    :param c1: optional weight
    :param c2: optional weight
    :param c3: optional weight
    :return: integer, minimum edit distance
    """

    return med(wordpunct_tokenize(sentence1), wordpunct_tokenize(sentence2), c1, c2, c3)
def features_from_dump(infile,variant,embeddings,bowfilter):
    frame = read_dump(infile)
    refstatements = [wordpunct_tokenize(st) for st in list(frame.Ref)]
    targetstatements = [wordpunct_tokenize(st) for st in list(frame.Target)]
    featuredicts = []

    for i in range(len(refstatements)):
        sp = StatementPair(i, refstatements[i], targetstatements[i], 0)
        commonwords, onlyref, onlytarget = sp._word_venn_diagram()
        trainingbow.update(onlyref)
        featuredicts.append(sp.featurize(variant, embeddings,bowfilter))

    return featuredicts
Example #13
def main():

    # related_words = {
    #     'art':['art', 'arts', , 'op art', 'pop art', 'art deco', 'art form', 'art house', 'art-house', 'clip art', 'fine art', 'art gallery', 'art nouveau', 'art therapy',  'kinetic art', 'martial art', 'art director', 'conceptual art', "objet d'art", 'performance art', 'work of art', 'state-of-the-art', 'the black art', 'thou art', 'noble art', 'craft', 'craftsmanship', 'ingenuity', 'mastery', 'artistry', 'imagination', 'Biedermeier', 'Parian', 'Queen Anne', 'annulate', 'anomphalous', 'banded', 'chryselephantine', 'aperture', 'collared', 'artificial', 'condensed', 'camera', 'copied'],

    #     'sport':['athletcis', 'recreation', 'candidacy', 'championship', 'clash', 'contention', 'event', 'fight', 'game', 'match', 'race', 'rivalry', 'run', 'sport', 'sports', 'struggle', 'tournament', 'trial', 'basketball', 'football', 'soccer', 'badminton', 'archery', 'tennis', 'swim']
    # }
    result = dict()
    clubs = list(Club.objects.all())
    print len(clubs)

    for club in clubs:
        score = 0
        # try:
        if club.introduction:
            intro = club.introduction
        else:
            intro = ""
        name = club.name
        max_score = 0
        max_cat = None
        for category in CATEGORIES:
            all_words = wordpunct_tokenize(intro.lower())
            all_name_words = wordpunct_tokenize(name.lower())
            score = 0
            for word in determinstic_words[category]:
                score += all_words.count(word) * 2
                score += all_name_words.count(word) * 10
            if score > max_score:
                max_cat = category
                max_score = score

        if max_cat and max_score > 2:
            category = Category.objects.get(name=max_cat)
            club.categories.add(category)
            club.save()

            try:
                # print name, max_cat, max_score
                result[max_cat].append(name)
            except KeyError:
                result[max_cat] = [name]

    for category in CATEGORIES:
        print category
        try:
            for club in result[category]:
                print club
        except:
            pass
        print "\n"
Example #14
def hypernym_count(sent1,sent2):

    s1_words =  [i.lower() for i in wordpunct_tokenize(sent1) ]
    s2_words =  [i.lower() for i in wordpunct_tokenize(sent2) ]
    s1_all = []
    s2_all = []

    for w in s1_words:
        s1_all.extend(get_hypernyms(w))
    for w in s2_words:
        s2_all.extend(get_hypernyms(w))
    w1_hypernym = len(set(s1_words).intersection(set(s2_all)))
    w2_hypernym = len(set(s2_words).intersection(set(s1_all)))
    return w1_hypernym-w2_hypernym
Example #15
def frequencies(sentence_texts, stopword = False):
    #lower case
    out = sentence_texts.lower()
    
    #remove punctuation
    out = out.translate(string.maketrans("",""), string.punctuation) 
    
    #tokenize     
    out = wordpunct_tokenize(out) 
    
    #build Dictionary of key=word value=number of occurrences
    frequencies = {}
    for word in out:
        if word not in frequencies:
            #if word is a stopword and stopword is on, do not add
            if not(stopword == True and word in stopwords.words('english')):
                frequencies[word] = 1
        else:
            frequencies[word] += 1
    
    #sort frequencies
    sorted_frequencies = sorted(frequencies.iteritems(), key=operator.itemgetter(1), reverse=True)

    #output largest frequency first
    return sorted_frequencies
def perplexity(f_cost, lines, worddict, options, verbose=False, wv_embs=None):
    n_lines = len(lines)
    cost = 0.
    n_words = 0.

    for i, line in enumerate(lines):
        # get array from line
        wordin = wordpunct_tokenize(line.strip())
        seq = [worddict[w] if w in worddict else 1 for w in wordin]
        seq = [s if s < options['n_words'] else 1 for s in seq]
        n_words += len(seq)+1
        x = numpy.array(seq+[0]).astype('int64').reshape([len(seq)+1,1])
        x_mask = numpy.ones((len(seq)+1,1)).astype('float32')
        if options['use_preemb']:
            shp = x.shape
            xi = wv_embs[x.flatten()].reshape([shp[0], shp[1], wv_embs.shape[1]])
        else:
            xi = x
        cost_one = f_cost(x, x_mask, xi, x_mask) * (len(seq)+1)
        cost += cost_one

        if verbose:
            print 'Sentence ', i, '/', n_lines, ' (', numpy.mean(seq), '):', 2 ** (cost_one/len(seq)/numpy.log(2)), ', ', cost_one/len(seq)
    cost = cost / n_words
    return cost
Example #17
	def calcFreq(self,cb,i):
		wordFreq = dict()
		path = "/home/mis/file_" + str(i) + ".txt"
		conteudo = self.reader.readerFile(path)
		print (":: Retirando Pontuação -> %s" % time.strftime("%d/%m/%Y %H:%M:%S"))
		make = str.maketrans(string.punctuation,'                                ')
		conteudoLimpo = conteudo.translate(make)
		del conteudo  
		print (":: Retirando espaço duplicados -> %s" % time.strftime("%d/%m/%Y %H:%M:%S"))
		conteudo = conteudoLimpo.strip()
		conteudoLimpo = re.sub(' +',' ',conteudoLimpo)

		print (":: Token -> %s" % time.strftime("%d/%m/%Y %H:%M:%S"))
		palavras = tokenize.wordpunct_tokenize(conteudoLimpo)
		print (":: Frequencia -> %s" % time.strftime("%d/%m/%Y %H:%M:%S"))
		frequencias = nltk.FreqDist(palavras)

		print (":: Monta WordFreq -> %s" % time.strftime("%d/%m/%Y %H:%M:%S"))
		for key in frequencias.keys():
			wordFreq[key.strip().lower()] = frequencias[key]

		self.palavras = self.palavras + palavras
		del conteudo
		del palavras
		del frequencias
		return wordFreq
def split_sentence_from_document(document):
    max_counts = 0
    for sent in tokenize.sent_tokenize(document):
        max_counts = max(max_counts, len(tokenize.wordpunct_tokenize(sent)))
    # if max_counts>4000:
    #     print(document)
    return max_counts
def tokenize(directory,exclude_files):
	full_content = ''
	for _file in os.listdir(directory):
		#disp_count = 5
		if exclude_files  and (_file in exclude_files):
			continue
		with open(directory+_file,'r') as f:
			contents = f.readlines()
			for item in contents:
				try:
					sentence = item.split('\t')[1].strip()
					full_content += sentence
				except IndexError:
					continue
				# if np.random.binomial(1,0.1):

				# 	print sentence
				# 	time.sleep(2)				
				# 	disp_count -=1 
				# 	if not disp_count:
				# 		print '*'*100
				# 		break
						
				# else:
				# 	print '#'

	return wordpunct_tokenize(full_content.lower())
Example #20
def preprocess(line, is_lmz=False):
    line = wordpunct_tokenize(line.strip())
    if is_lmz:
        lemmatizer = WordNetLemmatizer()
        line = [lemmatizer.lemmatize(word) for word in line]

    return line
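A quick hedged check of preprocess: wordpunct_tokenize splits on punctuation boundaries, so contractions come apart, and the lemmatized variant needs NLTK's wordnet data installed.

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize

print(preprocess("The cats aren't running."))
# e.g. ['The', 'cats', 'aren', "'", 't', 'running', '.']
print(preprocess("The cats aren't running.", is_lmz=True))
# e.g. ['The', 'cat', 'aren', "'", 't', 'running', '.']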
Example #21
 def __init__(self, project_dict):
     self.pid = project_dict['id']
     self.blurb = project_dict['blurb'].lower()
     self.deadline = project_dict['deadline']
     self.category_id = project_dict['category']['id']
     self.category_desc = re.sub('/.*', '', project_dict['category']['slug'])
     self.reward_backer_tup = project_dict['reward_backer_tup'] 
     self.text = project_dict['full_description'].lower() + " " + project_dict['risk'].lower()
     self.tokens = np.array(wordpunct_tokenize(self.text))
     self.name = project_dict['name'] 
     self.url = project_dict['url'] 
     self.launched_at = project_dict['launched_at'] 
     self.pledged = project_dict['pledged']
     self.title = project_dict['title']
     self.no_dollars_raised = project_dict['no_dollars_raised']
     self.currency = project_dict['currency']
     self.no_backers = project_dict['no_backers']
     self.state = project_dict['state']
     self.deadline = project_dict['deadline']
     self.location = project_dict['location']
     self.backers_count = project_dict['backers_count']
     self.creator_url = project_dict['creator_url']
     self.backers_count = project_dict['backers_count']
     self.spotlight = project_dict['spotlight']
     self.goal = project_dict['goal']
     self.author = project_dict['author']
def json_converter_ifn_body(file_path):
    """
    Raw-to-JSON converter for IFN.
    Reads the test file and constructs a list of token lists.
    ARGS: file_path (the file you want to classify)
    """
    # read the query file
    line_flag=False;
    motif_flag=False;
    motif_stack=[];
    line_stack=[];
    with codecs.open(file_path, 'r', 'utf-8') as lines:
        for line in lines:
            if line==u'\n':
                continue;
            if line==u'#motif\n':
                motif_flag=True;
                continue;
            elif line==u'#text\n':
                motif_flag=False;
                line_flag=True;
                continue;
            if motif_flag==True and line_flag==False:
                motif_stack.append(line.strip());
            if line_flag==True and motif_flag==False:
                line_stack.append(line.strip());
    
    tokens_stack=[tokenize.wordpunct_tokenize(line) for line in line_stack]
    tokens_stack=[[t.lower() for t in l] for l in tokens_stack]
    # stopwords are not removed here
    #if eliminate_stop==True: 
    #    tokens_stack=[[t for t in l if t not in stopwords and t not in symbols] for l in tokens_stack]
    # flatten the array from 2D to 1D; also lemmatize at the same time
    #tokens_stack=[lemmatizer.lemmatize(t) for line in tokens_stack for t in line];
    return tokens_stack, motif_stack;
def pick_top(number, sortedLst, ratio):
    unigrams = []
    bigramsplus = []
    for element in sortedLst:
        tokens = wordpunct_tokenize(element[0])
        if len(tokens) == 1:
            unigrams.append(element)
        else:
            bigramsplus.append(element)

    #will be a list of the top *number* strings
    topList = []
    unigramIndex = 0
    bigramIndex = 0
    while len(topList) < number:
        if unigramIndex == len(unigrams):
            if bigramIndex == len(bigramsplus):
                break
            else:
                topList.append(bigramsplus[bigramIndex][0])
                bigramIndex += 1
        elif bigramIndex == len(bigramsplus):
            topList.append(unigrams[unigramIndex][0])
            unigramIndex += 1
        else:
            if unigrams[unigramIndex][1] * ratio < bigramsplus[bigramIndex][1]:
                topList.append(bigramsplus[bigramIndex][0])
                bigramIndex += 1
            else:
                topList.append(unigrams[unigramIndex][0])
                unigramIndex += 1

    return topList
def tokenStem(words):
    words = words.strip('[').strip(']').lower() #remove brackets and lowercase
    words = re.sub('[(){}<>:,.!?\'"]', '', words)
    stemmer = PorterStemmer()
    stops = stopwords.words('english')
    output = [stemmer.stem(token) for token in wordpunct_tokenize(words) if token not in stops ] #stem words
    return " ".join(output) #merge into strings
def tokenizeNoPunctuation(tweets):
    tokens = []
    stoplist = [',', '(', ')', '.', '?', '/', '+', ':', ';']
    for tweet in tweets:
        tokenized = wordpunct_tokenize(tweet)
        tokens.append([token for token in tokenized if token not in stoplist])
    return tokens
def best_dressed(year):
    if year not in yearMap.keys():
        prep_year(year)

    strings = yearMap[year]['strings']
    dressPattern = re.compile(r'(dress)|(red carpet)|(redcarpet)', re.IGNORECASE)
    posPattern = re.compile(r'(best)|(beautiful)|(stun)|(love)', re.IGNORECASE)
    negPattern = re.compile(r'(worst)|(bad)|(ugly)|(hate)', re.IGNORECASE)
    namePattern = re.compile(r'[A-Z]\w* [A-Z]\w*') 
    stoplist = ['new','red','carpet','redcarpet','globes','golden','best','worst','movie','motion','picture','film','drama','comedy','musical','cecil','demille','award','tv','performance', 'actress','actor','television','feature','foreign','language','supporting','role','director','original','series']

    dress_mentions = Counter()
    dress_mentions_neg = Counter()
    dress_mentions_pos = Counter()
    for tweet in strings:
        if re.search(dressPattern, tweet):
            matches = re.findall(namePattern, tweet)
            matches = (w.lower() for w in matches)
            for match in matches:
                match_words = wordpunct_tokenize(match)
   
                if match_words[0] not in stoplist and match_words[1] not in stoplist:
                    dress_mentions[match] += 1
                    if re.search(posPattern, tweet):
                        dress_mentions_pos[match] += 1
                    if re.search(negPattern, tweet):
                        dress_mentions_neg[match] += 1


    discussed_dress = dress_mentions.most_common(1)
    best_dress = dress_mentions_pos.most_common(1)
    worst_dress = dress_mentions_neg.most_common(1)

    return best_dress[0][0], worst_dress[0][0], discussed_dress[0][0]
Example #27
def test_small_talk_filter(_bot_brain):
    bot, pos, lex = _bot_brain
    tester = wordpunct_tokenize("raining snowing sunny weather")
    weather_opts = ["Talking about the weather is such a bore.",
        "I'm not the weatherman!"]
    sentence = input_filters.filter_small_talk(tester)
    assert sentence in weather_opts
Example #28
def test_filter_length_words():
    tester = wordpunct_tokenize("I am not happy but I am not hungry either.")
    seeds = input_filters.filter_length_words(tester)
    assert "I" not in seeds
    assert "hungry" in seeds
    assert "happy" in seeds
    assert "am" not in seeds
Example #29
 def PredictReviewScore(self, sentences, label=0):
     """
     This method gives a score to a review.
     """
     AdjR = 0.0
     # if text.startswith("For more photos and reviews do check out fourleggedfoodies"):
     #     x = 1
     adjAll = []
     for sentence in sentences:
         adjectives, dependencies = self.ExtractSentDetails(sentence)
         adjAll.extend(adjectives)
         allAdjectives = adjectives | Angel.GlobalAdjList
         AdjS = 0.0
         words = wordpunct_tokenize(sentence["Text"])
         if len(words) <= 3:
             allAdjectives |= set([x.lower() for x in words])
         for i in range(len(words)):
             word = words[i].lower()
             if word in {"but", "if"}:
                 AdjS = 0.0
             elif word in allAdjectives and word in self.lexicon:
                 AdjS += float(self.lexicon[word]) * self.PredictMultiplier(word, dependencies[word], words, i)
         AdjR += AdjS
     AdjR *= self.PredictBase(adjAll)
     finalScore = AdjR
     if self.DumpRequested(finalScore, label):
         self.DumpDetails(sentences, label)
     return finalScore
 def Match(self, text):
     # tokenize and normalize our text
     textArr = tokenize.wordpunct_tokenize(text.lower().strip())
     hits = 0
     results = []
     secondary = []
     # -tlength as we need to iterate over a window of tlength words
     for ti in xrange(0, len(textArr) - self.tlength):
         for termT in self.toMatch:
             # so what's the distance between our first token and the term?
             dist1 = editdist.distance(textArr[ti], termT[hits])
             if dist1 <= self.thresh:
                 if len(termT) <= 1:
                     print "got hit with %s" % termT
                     results.append(termT[hits])
                 else:
                     dist2 = editdist.distance(textArr[ti+1], termT[hits+1])
                     print "distance between %s and %s is %s" % (textArr[ti+1], termT[hits+1], dist2)
                     # WARNING: this will only work for 2-grams where the tlength is an n-gram.
                     if dist2 <= self.thresh:
                         # we have a close hit; check if the second term in the tuple is a hit as well
                         #hits = hits + 1
                         results.append("%s %s" % (termT[hits], termT[hits+1]))
                         #print termT
                         #print "got hit on term %s" % results
         # looks like we've found a match
     #print secondary
     # we're done
     return results
Example #31
 def summarize(self):    
     
     self.sentences = sent_tokenize(self.text)
     
     self.tokenizedSentences = [] 
     for sentence in self.sentences:
         self.tokenizedSentences.append(Counter([word for word in wordpunct_tokenize(sentence) if word not in self.Puncts]))  
     
     self.b_matrix = self.CV.fit_transform(self.sentences)
     
     self.n_matrix = TfidfTransformer().fit_transform(self.b_matrix)
     
     self.sim_graph = self.n_matrix * self.n_matrix.T;
     
     self.sen_graph = nx.from_scipy_sparse_matrix(self.sim_graph)
     
     self.sen_scores = nx.pagerank(self.sen_graph)
     
     self.sorted_sentences_1 = sorted(self.sentences, key = lambda s: self.sen_scores[self.sentences.index(s)], reverse=True)
     
     self.sorted_sentences_2 = sorted(self.sorted_sentences_1[:5], key = lambda s: self.sentences.index(s))
Example #32
    def featurize(self, input_str):
        input_str = gensim.utils.to_utf8(input_str,
                                         errors='replace').decode("utf8")
        doc = wordpunct_tokenize(input_str)
        doc = [w.lower() for w in doc]

        # Convert from tokens to word ids from the model dictionary.
        doc_bow = self.dict.doc2bow(doc)

        # Simply add up all the vectors and return.
        vec = self.model[doc_bow]
        col = []
        data = []
        for topicNum, value in vec:
            data.append(value)
            col.append(topicNum)

        row = [0 for _ in range(len(data))]
        vec = coo_matrix((data, (row, col)),
                         shape=(1, self.model.num_topics)).toarray()
        return vec
Example #33
def get_msg_words(msg, stopwords=[], strip_html=False):
    """get msg workds"""
    msg = re.sub('3D', '', msg)

    if strip_html:
        msg = re.sub('<(.|\n)*?>', ' ', msg)
        msg = re.sub('&\w+;', ' ', msg)

    msg = re.sub('_+', '_', msg)

    msg_words = set(wordpunct_tokenize(msg.replace('=\n', '').lower()))

    # Get rid of stopwords
    msg_words = msg_words.difference(stopwords)

    # Get rid of punctuation tokens, numbers, and single letters.
    msg_words = [
        w for w in msg_words if re.search('[a-zA-Z]', w) and len(w) > 1
    ]

    return msg_words
Example #34
def main():
    input_text = 'We will discuss briefly about the basic syntax,\
 structure and design philosophies. \
 There is a defined hierarchical syntax for Python code which you should remember \
 when writing code! Python is a really powerful programming language!'

    #synsets = wn.synsets('phone')
    #print [str(syns.definition()) for syns in synsets]

    synsets = wn.synsets('philosophies')
    for syns in synsets:
        print 'philosophies', '==>', syns.definition()

    synsets = wn.synsets('Python')
    for syns in synsets:
        print 'Python', '==>', syns.definition()

    op = word_tokenize(input_text)
    print '\nTokenize output', op

    print '\nStemming output'
    for e in op:
        if len(e) > 1:
            porter_stemmer = PorterStemmer()
            print porter_stemmer.stem(e)

    print '\nPOS', pos_tag(op)

    print('\nLemmatize output')
    lm = WordNetLemmatizer()
    for e in op:
        if len(e) > 1:
            print lm.lemmatize(e)

    print '\nTrigram output'
    trigrams = ngrams(op, 3)
    for grams in trigrams:
        print grams
    print '\nNamed Entity Recognization'
    print ne_chunk(pos_tag(wordpunct_tokenize(input_text)))
def simple_neg_pos_wc(df, column_name, sum=False):
    neg_words = pd.read_csv('negative-words.txt',
                            skiprows=36,
                            header=None,
                            encoding='ISO-8859-1')
    pos_words = pd.read_csv('positive-words.txt',
                            skiprows=36,
                            header=None,
                            encoding='ISO-8859-1')
    stop_words = set(
        stopwords.words('english')
    )  # these are nonsense words that don't belong in the wordcloud ('a','the' etc.)
    stemmer = SnowballStemmer(
        "english"
    )  # this stemmer will clip the end of words so that begins and begin etc. look the same

    stop_words.update(
        ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{',
         '}'])  # add some characters to the stopwords

    # for every entry in the column, get the list of words and stem them. remove stop words
    word_list = [
        stemmer.stem(i.lower())
        for i in wordpunct_tokenize(" ".join(df[column_name].dropna()))
        if i.lower() not in stop_words
    ]

    # correct any spelling mistakes introduced by the stemmer
    word_list = [spell(i) for i in word_list]
    pos_count = 0
    neg_count = 0
    for word in word_list:
        if (neg_words.loc[:, 0] == word).sum() > 0:
            neg_count -= 1
        if (pos_words.loc[:, 0] == word).sum() > 0:
            pos_count += 1
    if sum:
        return neg_count + pos_count
    else:
        return neg_count, pos_count, len(word_list)
Example #36
def clean_text_simple(text, my_stopwords, punct, remove_stopwords=True, pos_filtering=True, stemming=True):
    text = text.lower()
    text = ''.join(l for l in text if l not in punct) # remove punctuation (preserving intra-word dashes)
    text = re.sub(' +',' ',text) # strip extra white space
    text = text.strip() # strip leading and trailing white space

    # tokenize (split based on whitespace)
    ### fill the gap (store results as 'tokens') ###
    tokens = wordpunct_tokenize(text)

    if pos_filtering == True:
        # POS tag and retain only nouns and adjectives
        tagged_tokens = pos_tag(tokens)
        tokens_keep = []
        for item in tagged_tokens:
            if (
            item[1] == 'NN' or
            item[1] == 'NNS' or
            item[1] == 'NNP' or
            item[1] == 'NNPS' or
            item[1] == 'JJ' or
            item[1] == 'JJS' or
            item[1] == 'JJR'
            ):
                tokens_keep.append(item[0])
        tokens = tokens_keep
    if remove_stopwords:
        # remove stopwords from 'tokens'
        ### fill the gap ###
        filtered_list = [w for w in tokens if not w in my_stopwords]
        tokens = filtered_list

    if stemming:
        # apply Porter's stemmer
        stemmer = PorterStemmer()
        tokens_stemmed = list()
        for token in tokens:
            tokens_stemmed.append(stemmer.stem(token))
        tokens = tokens_stemmed
    return(tokens)
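A hedged usage sketch for clean_text_simple; it assumes the imports the snippet itself relies on (re, wordpunct_tokenize, pos_tag, PorterStemmer) plus NLTK's tagger and stopword data.

import string
from nltk.corpus import stopwords

tokens = clean_text_simple("Python is a really powerful programming language!",
                           my_stopwords=stopwords.words('english'),
                           punct=string.punctuation)
print(tokens)  # nouns and adjectives only, stemmed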
def get_msg_words(msg, stopwords=[], strip_html=False):
    '''
    Returns the set of unique words contained in an e-mail message. Excludes 
    any that are in an optionally-provided list. 

    NLTK's 'wordpunct' tokenizer is used, and this will break contractions.
    For example, don't -> (don, ', t). Therefore, it's advisable to supply
    a stopwords list that includes contraction parts, like 'don' and 't'.
    '''

    # Strip out weird '3D' artefacts.
    msg = re.sub('3D', '', msg)

    # Strip out html tags and attributes and html character codes,
    # like &nbsp; and &lt;.
    if strip_html:
        msg = re.sub('<(.|\n)*?>', ' ', msg)
        msg = re.sub('&\w+;', ' ', msg)

    # wordpunct_tokenize doesn't split on underscores. We don't
    # want to strip them, since the token first_name may be more
    # informative than 'first' and 'name' apart. But there are tokens with
    # long underscore strings (e.g. 'name_________'). We'll just replace the
    # multiple underscores with a single one, since 'name_____' is probably
    # not distinct from 'name___' or 'name_' in identifying spam.
    msg = re.sub('_+', '_', msg)

    # Note: remove '=' symbols before tokenizing, since these sometimes
    # occur within words to indicate, e.g., line-wrapping.
    msg_words = set(wordpunct_tokenize(msg.replace('=\n', '').lower()))

    # Get rid of stopwords
    msg_words = msg_words.difference(stopwords)

    # Get rid of punctuation tokens, numbers, and single letters.
    msg_words = [
        w for w in msg_words if re.search('[a-zA-Z]', w) and len(w) > 1
    ]

    return msg_words
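A small usage sketch illustrating the behaviour the docstring describes: contractions split apart, repeated underscores collapse, and punctuation, number, and single-letter tokens are dropped.

msg = "Don't forget: your first_name_____ gets saved to file 42 soon!!"
print(get_msg_words(msg, stopwords=['don', 't', 'your', 'to']))
# e.g. ['forget', 'first_name_', 'gets', 'saved', 'file', 'soon'] (set order varies)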
Example #38
def detect_num(sentence):
    new_sentence = ''
    tokens = wordpunct_tokenize(sentence)
    ordinal_num = ['st', 'nd', 'rd', 'th']
    for i in tokens:
        if i.isalpha() == False:
            if i[-2:] in ordinal_num:

                new_sentence = new_sentence + num2words(
                    int(i[:-2]), to='ordinal').replace('-', ' ') + ' '
                #print('ordinal num\n', new_sentence)
            elif len(i) == 4:
                if '0' == i[1] and i[2] != '0':
                    new_sentence = new_sentence + num2words(int(i)).replace(
                        ' and ', ' ') + ' '
                else:
                    new_sentence = new_sentence + num2words(
                        int(i), to='year').replace('-', ' ') + ' '
            elif 's' in i:

                new_sentence = new_sentence + num2words(int(
                    i[:-1]), to='year').replace('-', ' ') + ' '
                #print('year\n', new_sentence)
            elif i.isdigit():
                new_sentence = new_sentence + num2words(int(i)).replace(
                    '-', ' ') + ' '
            else:
                word = ''
                for char in range(0, len(i)):
                    if i[char].isalpha(
                    ) or i[char] == ':' or i[char] == '[' or i[char] == ']':
                        word = word + i[char]
                    else:
                        word = word + num2words(int(i[char]))
                new_sentence = new_sentence + word + ' '
                #print('default\n',new_sentence)
        else:
            new_sentence = new_sentence + i + ' '
    new_sentence = new_sentence.strip(' ')
    return new_sentence
Example #39
    def create_naive_bayes_model_pickle(self):
        feature_set = []
        print "Feature set creating. !"
        for i in range(1,4,1):
            file_name = "data_set"+str(i)+".txt"
            file = open(self.data_set_dir+file_name,'r')
            j=0
            for line in file: 
                print "txt%s%s" %( i,j)
                j+=1
                words = wordpunct_tokenize(line)
                if words[len(words)-1] == "1" :
                    label = "pos"
                    reverse_label = "neg"
                elif words[len(words)-1] == "0" :
                    label = "neg"
                    reverse_label = "neg"
                else:
                    label = "neu"
                    reverse_label = "neu"
                del words[len(words)-1]
                words = self.clean_words(words)
                feature_set.append((self.create_feature_set(words)[0],label))
                feature_set.append((self.create_feature_set(words)[1],reverse_label))
        print "Feature set created. !"
        random.shuffle(feature_set)
        feature_set += self.twitter_data_training()
        # feature_set += self.moviereview_data_training()
        training_set = feature_set[:14500]
        testing_set = feature_set[14500:]
        testing_set = training_set
        print "Training..."

        classifier = nltk.classify.NaiveBayesClassifier.train(training_set)
       
        file = open(self.naive_base_model, 'wb')
        pickle.dump(classifier, file)
        file.close()

        print "Accuracy:" + str(nltk.classify.accuracy(classifier, testing_set))
Example #40
    def analyze_composite(self, sentence, weightedLexicon):
        if weightedLexicon:
            lexicon = self.weightedLexicon
        else:
            lexicon = self.unweightedLexicon

        tweetScore = 0
        words = wordpunct_tokenize(sentence)

        for index, word in enumerate(words):
            term = word
            if len(words[index:]) < 6:
                maxim = index + len(words[index:]) - 1
            else:
                maxim = index + 5
            for i in range(index + 1, maxim + 1):
                term = term + " " + words[i]
                #print term + ":" + str(self.lookUpWordScore(term, lexicon))
                tweetScore = tweetScore + self.lookUpWordScore(
                    term, lexicon, False)

        return tweetScore
def neg_pos_inv(df, column_name):
    neg_words = pd.read_csv('negative-words.txt',
                            skiprows=36,
                            header=None,
                            encoding='ISO-8859-1')
    pos_words = pd.read_csv('positive-words.txt',
                            skiprows=36,
                            header=None,
                            encoding='ISO-8859-1')
    inv_words = ['not', 'lack of', 'only', 'can\'t', 'no', 'more']
    #stop_words = set(stopwords.words('english'))  # these are nonsense words
    stop_words = ([
        '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',
        'enhanced'
    ])

    new_df = df[column_name].dropna().reset_index()
    new_df['sentiment'] = 0
    negation = False
    prev = None
    pprev = None
    for i in range(len(new_df)):
        word_list = [
            w.lower() for w in wordpunct_tokenize("".join(
                new_df[column_name][i]))
            if w.lower() not in stop_words
        ]
        for word in word_list:
            if ((pos_words.loc[:, 0] == word).sum() >
                    0) and (prev not in inv_words):
                new_df['sentiment'][i] += 1
            if ((pos_words.loc[:, 0] == word).sum() >
                    0) and (prev in inv_words or pprev in inv_words):
                new_df['sentiment'][i] -= 1
            if (neg_words.loc[:, 0] == word).sum() > 0:
                new_df['sentiment'][i] -= 1
            pprev = prev
            prev = word
    return new_df['sentiment'].sum()
Example #42
    def featurize(self, input_str, num_best=None, use_reverse_index=True):
        """
        Returns similar documents by cosine similarity based on TF-IDF score.
        If num_best is left as None, returns a numpy.array with a score for
        every document in the corpus. Otherwise, it returns the top-K scored
        items as a list of (doc_idx, score) tuples.
        If use_reverse_index is set to False, the forward index is used (and
        the full corpus is queried). This is only a good idea when the number
        of terms in the input string is big, such as the text of a long article.
        For short documents, using the reverse index is usually much faster.
        """

        logger.debug("input string: %s", input_str)

        # Tokenize the input string.
        input_str = utils.to_utf8(input_str, errors='replace').decode("utf8")
        doc = wordpunct_tokenize(input_str)
        doc = [w.lower() for w in doc]

        # Convert from tokens to word ids from the model dictionary.
        doc_bow = self.dictionary.doc2bow(doc)

        # Get TF-IDF score for the document words (this does not update the TF-IDF model itself).
        doc_tfidf = self.tfidf[doc_bow]

        # Calculate similarity scores.
        self.similarity_index.use_reverse_index = use_reverse_index
        similar_docs = self.similarity_index[doc_tfidf]

        # Fall back to self.num_best if it wasn't specified here.
        if num_best is None:
            num_best = self.num_best
        if num_best is None:
            return similar_docs

        # Return top-k if requested.
        return heapq.nlargest(num_best,
                              enumerate(similar_docs),
                              key=lambda item: item[1])
def extract_tag_n_grams_and_add_to_dict(data_dir):
    listing = os.listdir(data_dir)
    list = []
    for filename in listing:
        with open(data_dir + filename, 'r') as f:
            tag_list_for_line = []
            for line in f:
                if (':' in line):
                    line_list = wordpunct_tokenize(line)
                    tag = line_list[-1]
                    tag_list_for_line.append(tag)
                else:
                    if (len(tag_list_for_line) > 0):
                        add_to_dict_n_gram_tags(dict_unigram,
                                                tag_list_for_line)
                        bigrams = ngrams(tag_list_for_line, 2)
                        add_to_dict_n_gram_tags(dict_bigram, bigrams)
                    tag_list_for_line = []

    list.append(dict_unigram)
    list.append(dict_bigram)
    return list
def json_to_conll(corpus_json_location, output_location, by_sent = False):
    with codecs.open(corpus_json_location, encoding='utf-8') as in_file:
        reviews = map(json.loads, in_file.readlines())

    with codecs.open(output_location, 'w', encoding='utf-8') as out_file:
        for review in reviews:
            documents = sent_tokenize(review['text']) if by_sent else [review['text']]
            w_start = 0
            w_end = 0
            for document in documents:
                tokens = wordpunct_tokenize(document)
                corrected_tokens = map(correct, tokens)
                pos_tags = tagger.tag(corrected_tokens)
                for token, temp in zip(tokens, pos_tags):
                    token_corr = temp[0]
                    pos_tag = temp[1]
                    w_start, w_end, delimitter = get_token_position_in_text(token, w_start, review['text'])
                    bio_tag = get_bio_tag(w_start, w_end, review['entities'])
                    lemm = lemmatizer.lemmatize(token_corr, get_wordnet_pos(pos_tag))
                    out_file.write(u'{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(token, lemm, pos_tag, bio_tag, w_start, w_end, delimitter, review['id']))
                    w_start = w_end - 1
                out_file.write('\n')
Example #45
 def get_keywords(self, document):
     '''
     groups keywords which are separated by stop words and punctuation
     '''
     sentences = sent_tokenize(document)
     candidate_keywords = []
     for i, sentence in enumerate(sentences):
         curr_keyword = []
         tokens = wordpunct_tokenize(sentence)
         # normalize case and remove punctuation
         words = [w.lower() for w in tokens if w.isalnum()]
         for word in words:
             if word not in self.stopwords:
                 curr_keyword.append(word)
             else:
                 if curr_keyword != []:
                     candidate_keywords.append({
                         'keyword_list': curr_keyword,
                         'sentence_num': i,
                     })
                     curr_keyword = []
     return candidate_keywords
def normalize_doc(data):
    for i in range(1, len(data)):
        temp = []
        temp2 = wordpunct_tokenize(data[i])
        l = 0.0
        for word in temp2:
            if word not in temp:
                temp.append(word)

        for word in temp:
            if word in tf_data:
                l = l + pow(tf_data[word], 2)

        l = sqrt(l)
        for word in temp:
            if word in tf_data:
                tf_data[word] /= l
        '''
			tf-idf score
		'''
        for key in tf_data.keys():
            tf_idf_data[key] = tf_data[key] * idf_data[key]
Example #47
def get_instagram_caption_terms(hashtag):
    search_url = 'https://www.instagram.com/explore/tags/' + hashtag + '/?__a=1'
    contents = urllib2.urlopen(search_url).read()
    results = json.loads(contents)
    edges = results['graphql']['hashtag']['edge_hashtag_to_media']['edges']
    texts = []
    for edge in edges:
        captions = edge['node']['edge_media_to_caption']['edges']
        for caption in captions:
            text = caption['node']['text']
            words = [
                i.lower()
                for i in wordpunct_tokenize(text.encode('ascii', 'ignore'))
                if i.lower() not in stop_words
            ]
            texts.append(words)
    # remove words that appear only once across all captions
    all_tokens = [word for caption_words in texts for word in caption_words]
    tokens_once = set(word for word in set(all_tokens)
                      if all_tokens.count(word) == 1)
    texts = [[word for word in caption_words if word not in tokens_once]
             for caption_words in texts]
    return texts
    def pre_process_data(self):

        word_dict = dict()
        data_matrix = dict()
        word_index = 0

        pos_files = gl.glob('pos\*.txt')
        neg_files = gl.glob('neg\*.txt')

        self.pos_files = pos_files

        #pos_files = pos_files[0:100] # 2000
        #neg_files = neg_files[0:100] # 2000

        pos_files.extend(neg_files)

        self.all_files = pos_files

        for fl in pos_files:
            review_text = self.remove_stop_words(fl)
            review_tokenized = wordpunct_tokenize(review_text)
            for word in review_tokenized:
                if word not in word_dict:
                    word_dict[word] = word_index
                    word_index = word_index + 1
                if (word, fl) not in data_matrix.keys():
                    data_matrix[(word, fl)] = 1
                else:
                    data_matrix[(word, fl)] = data_matrix[(word, fl)] + 1

        self.data_matrix = dok_matrix((len(word_dict.keys()), len(pos_files)))
        for word, fl in data_matrix.keys():
            word_index = word_dict[word]
            doc_index = pos_files.index(fl)
            self.data_matrix[word_index, doc_index] = data_matrix[(word, fl)]

        savemat('dm.mat', mdict={'arr': self.data_matrix})
        self.vocabulary = list(word_dict.keys())
        self.data_matrix = self.data_matrix.transpose()
def unigramPerplexity():
    global filename, totalLines, tokens, index
    with open(filename) as file:
        perplexities = []
        for line in file:
            listOfWords = wordpunct_tokenize(line)
            l = len(listOfWords)
            prob = []
            for i in range(l):
                word = listOfWords[i]
                prob.append(wordDict[word][1] / float(tokens))
            per = 1
            for p in prob:
                per = per * p
            if per != 0:
                per = 1 / float(per)
            perplexities.append(pow(per, 1 / float(l)))
    PP = 0
    for i in perplexities:
        PP = PP + i
    PP = PP / float(len(perplexities))
    return PP
Example #50
	def tokenize_short_text(self, raw_tweet_text):

		tweet_text = raw_tweet_text
		#tweet_text = tweet_text.strip()
		#tweet_text = unidecode.unidecode(tweet_text)
		
		if self.args.use_lowercase:
			tweet_text = tweet_text.lower()
		
		if self.tokenizer > 0:
			if self.tokenizer == 1:
				uttterance_tokens = word_tokenize(tweet_text)
			if self.tokenizer == 2:
				uttterance_tokens = wordpunct_tokenize(tweet_text)
			if self.tokenizer == 3:
				uttterance_tokens = self.tweet_tokenizer.tokenize(tweet_text)
			if self.tokenizer == 4:
				tweet_text = clean(tweet_text)
				tweet_text = self.remove_accented_chars(tweet_text)
				uttterance_tokens = self.tweetokenizer.tokenize(tweet_text)
				uttterance_tokens = self.remove_duplicated_sequential_words(uttterance_tokens)
				uttterance_tokens = self.remove_stopwords(uttterance_tokens)

			if self.tokenizer == 5:
				tweet_text = tokenize(' '.join(self.tweet_tokenizer.tokenize(tweet_text)))
				return tweet_text
			
			if self.tokenizer == 6:
				tweet_text = clean(' '.join(self.tweet_tokenizer.tokenize(tweet_text)))
				return tweet_text

			if self.stem:
				uttterance_tokens = [list(map(self.stemmer.stem, sub)) for sub in uttterance_tokens]
			if self.lemmatize:
				uttterance_tokens = [[self.lemmatizer.lemmatize(tok, pos='v') for tok in sub] for sub in uttterance_tokens]
			
			tweet_text = " ".join(uttterance_tokens)
		
		return tweet_text
Example #51
def normalize_hospital_name(name):
    """Normalizes a given hospital name.
        1. Converts all words to lower case.
        2. Removes all stopwords.

    Args:
        name -- Name to be normalized.

    Returns:
        slug -- The normalized hospital name.
    """
    normalized_name = name.lower()

    stopword_list = stopwords.words('english')
    filtered_words = [
        word for word in wordpunct_tokenize(normalized_name)
        if word not in stopword_list
    ]

    slug = slugify(' '.join(filtered_words))

    return slug
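Hypothetical usage; this assumes slugify comes from the python-slugify package and that NLTK's stopword corpus is available.

print(normalize_hospital_name("The Hospital of St. Mary"))
# e.g. 'hospital-st-mary' (stopwords 'the' and 'of' removed, then slugified)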
def createBigram():
    global filename, totalLines, tokens, index
    with open(filename) as file:
        for line in file:
            listOfWords = wordpunct_tokenize(line)
            l = len(listOfWords)
            if l != 0:
                word = listOfWords[0]
                key = str(["", word])
                if key not in matrix:
                    matrix[key] = 1
                else:
                    matrix[key] += 1
                # matrix[V][wordDict[word][0]]+=1
            for i in range(l - 1):
                word = listOfWords[i]
                next_word = listOfWords[i + 1]
                key = str([word, next_word])
                if key not in matrix:
                    matrix[key] = 1
                else:
                    matrix[key] += 1
Example #53
def add_subword(sentence, subword_dict=None, additional_dict=None, dropout=1):
    """
    this will make an n-gram of the words in the sentence.
    you can easily add stuff by using additional_dict.
    Any if the word contians the key the value (must be int) will be +1. 

    dropout is 1 by default if your training you can change this to accomodate regularization
    the value of dropout is the percentage to keep

    """
    if subword_dict is None:
        subword_dict = {
            abc: i
            for i, abc in enumerate('abcdefghijklmnopqrstuvwxyz0123456789?')
        }

    if additional_dict is None:
        additional_dict = {}
        # examples
        additional_dict["en"] = 38
        additional_dict["sch"] = 39

    for key in additional_dict:
        value = additional_dict[key]
        assert type(value) == int
        subword_dict[key] = value

    max_value = find_max_sub_dict_value(subword_dict)

    sentence = [word for word in wordpunct_tokenize(sentence)]
    subword = np.zeros((len(sentence), max_value))

    for word_num, word in enumerate(sentence):
        for key in subword_dict:
            if word.__contains__(key):
                if np.random.rand() < dropout:
                    subword[word_num, subword_dict[key]] += 1

    return subword
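find_max_sub_dict_value is not shown in the example; a plausible stand-in returns one more than the largest column index, and the call below only demonstrates the shape of the returned matrix.

import numpy as np
from nltk.tokenize import wordpunct_tokenize

def find_max_sub_dict_value(subword_dict):
    # hypothetical helper: number of feature columns
    return max(subword_dict.values()) + 1

features = add_subword("fresh french bread?")
print(features.shape)  # (number of tokens, number of subword features), e.g. (4, 40)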
def tokenize_and_stem(text, stemmer=SnowballStemmer("english")):
    """Word and sentence tokenization function that utilizes the Snowball Stemmer

    Args:
        text (string): All the reviews for a single user concatenated into a single string
        stemmer (Stemmer): The stemmer to be used
    Returns:
        stems (list): The filtered and stemmed tokens

    """
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [
        word for sent in nltk.sent_tokenize(text)
        for word in wordpunct_tokenize(sent)
    ]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
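A quick hedged check of tokenize_and_stem; besides the imports the snippet assumes (nltk, re, wordpunct_tokenize, SnowballStemmer), it needs NLTK's punkt sentence-tokenizer data.

print(tokenize_and_stem("The food was amazing. Friendly servers, too!"))
# e.g. ['the', 'food', 'was', 'amaz', 'friend', 'server', 'too']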
    def create_dtm(self):
        dtm = []
        for tweet in self.data:

            # Make empty row
            newrow = dict()
            for term in self.top_words.keys():
                newrow[term] = 0

            tweetwords = [
                self.porter.stem(i.lower()) for i in wordpunct_tokenize(tweet)
                if i.lower() not in self.stop_words
                and not i.lower().startswith('http')
            ]

            for word in tweetwords:
                if word in self.top_words.keys():
                    newrow[word] += 1

            dtm.append(newrow)

        self.dtm = dtm
    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document[2:]):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma
Example #57
    def _build_vocab(self):
        word_index = {}
        for doc in self.corpus:
            for sentence in sent_tokenize(doc):
                tokens = wordpunct_tokenize(sentence)
                tokens = [token.lower().strip() for token in tokens]
                tokens = [
                    token for token in tokens
                    if re.match('^[a-z]+$', token) is not None
                ]
                for token in tokens:
                    word_index[token] = word_index.get(token, 0) + 1

        filtered_word_index = {}
        # i= 0 for empty, 1 for OOV
        i = 2
        for word, count in word_index.items():
            if count >= Preprocess.MIN_WD_COUNT:
                filtered_word_index[word] = i
                i += 1
        print('Found %s unique tokens.' % len(filtered_word_index))
        return filtered_word_index
Example #58
def parse_description(vid_text, nlp, parser):
    vid_text = sanitize_text(vid_text)
    raw_sentences = sentence_splitter.tokenize(vid_text)
    try:
        sentences = [
            ' '.join([w for w in wordpunct_tokenize(s)
                      if set(w) - punct_set]).replace(' .', '.')
            for s in raw_sentences
        ]
        # sentences = raw_sentences
        # print('here', sentences)
        # docs = [nlp(sent) for sent in sentences]

        # noun_phrase_chunks = {
        #     'chunks': [[(np.start, np.end) for np in doc.noun_chunks] for doc in docs],
        #     'named_chunks': [[np.text for np in doc.noun_chunks] for doc in docs]
        # }
        # constituent_parse = const_parse(vid_text, parser)
        constituent_parse = [
            list(i)[0] for i in parser.raw_parse_sents(sentences)
        ]
        # return constituent_parse
        # print([s.leaves() for s in constituent_parse])
        noun_phrase_chunks = np_chunker(vid_text, constituent_parse)
    except IndexError:
        # sentences = [' '.join([w for w in word_tokenize(s) if set(w) - punct_set]).replace(' .',  '.') for s in raw_sentences]
        constituent_parse = [
            list(i)[0] for i in parser.raw_parse_sents(raw_sentences)
        ]
        noun_phrase_chunks = np_chunker(vid_text, constituent_parse)
    pos_tags = [sent.pos() for sent in constituent_parse]
    # pos_tags = [(token.text, token.pos_, token.string) for token in doc]
    pos_tags = [item for sublist in pos_tags for item in sublist]
    parses = {
        'noun_phrase_chunks': noun_phrase_chunks,
        'pos_tags': pos_tags,
    }
    return parses
Example #59
    def __init__(self, fileName):
        '''
        Parse the document content into sentences.

        :param fileName: path of the file to be parsed
        '''
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        # if fileName.endswith('.htm'):
        #     with open(fileName) as file:
        #         self.filePath = fileName
        #         html_doc = file.read()
        #         soup = BeautifulSoup(html_doc, "lxml")
        #         self.title = soup.title.string
        #
        #         # get contents from the html without section titles
        #         # ignoring tags like <h1> <h2>
        #         paragraphs = []
        #         for paragraph in soup.find_all('p'):
        #             paragraphs.append(paragraph.get_text())
        #         data = "\n".join(paragraphs)
        #         self.rawLines = tokenizer.tokenize(data)
        if fileName.endswith('.txt'):
            with open(fileName) as file:
                self.filePath = fileName
                self.title = file.readline()
                lines = file.readlines()

                lines = [line.strip().replace('\xe2\x80\x83', ' ') for line in lines if line]

                # doc = ' '.join(lines)
                self.rawLines = []
                for line in lines:
                    self.rawLines += tokenizer.tokenize(line.decode('utf-8'))

            self.rawLines_stem = [' '.join([ps.stem(word) for word in wordpunct_tokenize(sentence)]) for sentence in self.rawLines]

        else:
            print 'Error, unable to read file', fileName
Example #60
def get_individual_rhymes(sonnets):
    all_rhymes = []
    for sonnet in sonnets:
        tokens = [wordpunct_tokenize(s) for s in sonnet]
        punct = set(['.', ',', '!', ':', ';', '?', '(', ')'])
        filtered = [ [w for w in sentence if w not in punct ] for sentence in tokens]
        last = [ sentence[len(sentence) - 1] for sentence in filtered]




        # now that we have a list of the last words, check the rhyme scheme:
        # specifically whether it is ababcdcdefefgg or another scheme
        if (len(last) == 14):
          pairs = [[last[0], last[2]], [last[1], last[3]], \
                      [last[4], last[6]], [last[5], last[7]], \
                      [last[8], last[10]], [last[9], last[11]], \
                      [last[12], last[13]]]
          all_rhymes += pairs
        else:
          print(f"Some weird sonnet appeared with length {len(last)}!")

    return all_rhymes