def run(self):
        """
        How do I run this Task?
        Luigi will call this method if the Task needs to be run.
        """
        # remove stop words and punctuation
        stop = set(stopwords.words('english'))
        tokenizer = RegexpTokenizer(r'\w+')
        wordnet = WordNetLemmatizer()

        docs = []

        #ipdb.set_trace()

        for f in self.input(): # The input() method is a wrapper around requires() that returns Target objects
            lines = 0
            words = []

            for line in f.open('r'):
                if lines == 0:
                    label = line
                    lines +=1
                else:
                    words.extend(tokenizer.tokenize(line))
                    lines +=1

            # use the precomputed stop-word set and keep the lemmatized, filtered tokens
            words_filtered = [wordnet.lemmatize(w) for w in words if w not in stop]
            docs.append((label, '\t'.join(words_filtered)))

        out = self.output().open('w')
        for label, tokens in docs:
            out.write("%s,%s\n" % (label.strip(), tokens.strip()))
        out.close()
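For context, a minimal sketch of how a Task with a run() like this is typically wired up in Luigi: requires() supplies the upstream Targets that self.input() iterates over, and output() names the Target that run() opens for writing. The class and task names below are hypothetical, not part of the original code.

import luigi

class PreprocessDocs(luigi.Task):
    def requires(self):
        # upstream tasks whose output() Targets become self.input() in run()
        return [FetchDoc(doc_id=i) for i in range(10)]   # FetchDoc is an assumed upstream Task

    def output(self):
        # single Target consumed by self.output().open('w') in run()
        return luigi.LocalTarget('preprocessed_docs.csv')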
    def map(self):
        # fetch a per-run counter from MongoDB (note: find_one_and_update returns the
        # document as it was before the $inc unless return_document is overridden)
        mc = MongoClient('ec2-52-0-148-244.compute-1.amazonaws.com', 27017)
        dbmc = mc.genid
        idoc = dbmc.gentable.find_one_and_update(filter={}, update={"$inc": {"score": 1}}, upsert=True)
        k = Key(self.bucket)
        y = stopwords.words('english')
        i = 1
        strx = str(int(idoc['score']))
        for line in sys.stdin:
            # strip stop words from the raw line, then tokenize what is left
            line = unicode(line, "utf-8", "ignore")
            pattern = re.compile(r'\b(' + r'|'.join(y) + r')\b\s*')
            line = pattern.sub('', line)

            tokenizer = RegexpTokenizer(r'\w+')
            words = tokenizer.tokenize(line)

            # store the cleaned line in S3 under a per-line key, then emit (word, key) pairs
            strz = strx + 'a' + str(i)
            k.key = strz
            filestring = line + '\n'
            k.set_contents_from_string(filestring)
            for word in words:
                word = word.encode(encoding='UTF-8', errors='ignore')
                print '%s\t%s' % (word.strip(), strz)
            i += 1
 def __init__(self, rtepair, stop=True, lemmatize=False):
     """
     @param rtepair: a L{RTEPair} from which features should be extracted
     @param stop: if C{True}, stopwords are thrown away.
     @type stop: C{bool}
     """
     self.stop = stop
     self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to',
                           'have', 'is', 'are', 'were', 'and', 'very', '.',','])
     
     self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
     # Try to tokenize so that abbreviations like U.S. and monetary amounts
     # like "$23.00" are kept as tokens.
     from nltk.tokenize import RegexpTokenizer
     tokenizer = RegexpTokenizer('([A-Z]\.)+|\w+|\$[\d\.]+')
     
     #Get the set of word types for text and hypothesis
     self.text_tokens = tokenizer.tokenize(rtepair.text)
     self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
     self.text_words = set(self.text_tokens)
     self.hyp_words = set(self.hyp_tokens)
     
     if lemmatize:
         self.text_words = set([lemmatize(token) for token in self.text_tokens])
         self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])
     
     if self.stop:
         self.text_words = self.text_words - self.stopwords
         self.hyp_words = self.hyp_words - self.stopwords
         
     self._overlap = self.hyp_words & self.text_words
     self._hyp_extra = self.hyp_words - self.text_words
     self._txt_extra = self.text_words - self.hyp_words
def lemmatizeall(word_list):
  """ Lemmatizes the word_list passing through each type of word

  Input: 
    word_list - list of words to be cleaned
    
    pos options: ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
  """
  word_types = "v", "a", "n", "s", "r"
  #print(word_types)
  #ipdb.set_trace() 
  wnl = nltk.WordNetLemmatizer()
  
  tokenizer = RegexpTokenizer(r'\w+')
  for x in range(0, len(word_list)):   
      
      word_tokens = tokenizer.tokenize(str(word_list[x]))
      word_tokens_lem = word_tokens
      for i in range(0, len(word_types)):
      
          pos = word_types[i]      
          word_tokens_lem = [wnl.lemmatize(w, pos=pos) for w in word_tokens_lem]
          
      sep = " "
      word_list[x] = sep.join(word_tokens_lem)
   
          #print(i)
  return word_list #[wnl.lemmatize(w, pos=pos) for w in word_list]  
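A hedged usage sketch, assuming nltk and its WordNet data are installed (e.g. via nltk.download('wordnet')) and that RegexpTokenizer has been imported as in the example above:

import nltk
from nltk.tokenize import RegexpTokenizer

# nltk.download('wordnet')  # one-time download of the WordNet data
print(lemmatizeall(["The cars were running", "better mice"]))
# roughly ['The car be run', 'good mouse']; exact output depends on the WordNet data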
Example #5
def textToWordList(txt):
    p_stemmer = RussianStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    stop_w = [p_stemmer.stem(i) for i in get_stop_words('ru')]
    r = re.compile('^[а-я]+$')
    badword =[
        'дом',
        'город',
        "дорог",
        "час",
        "ноч",
        "слов",
        "утр",
        "стран",
        "пут",
        "путешеств",
        "мест",
        'нов',
        "друз",
        "добр"
    ]
    txt = txt.lower().replace("<br>", "\n")
    tokens = [p_stemmer.stem(i) for i in tokenizer.tokenize(txt)]
    tokens = [i for i in tokens if not i in stop_w and r.match(i) and not i in badword]
    return tokens
Example #6
	def createLDAModel(texts, n_topics, n_passes):
		"""Generates a LDA model from an array of texts
		"""
		tokenizer = RegexpTokenizer(r'\w+')
		#Create EN stop words list
		en_stop = get_stop_words('en')
		#Create p_stemmer of class PorterStemmer
		p_stemmer = PorterStemmer()

		texts_ = []

		# loop through document list
		for i in texts:
		    
		    # clean and tokenize document string
		    raw = i.lower()
		    tokens = tokenizer.tokenize(raw)
		    
		    # remove stop words from tokens
		    stopped_tokens = [i for i in tokens if not i in en_stop]
		    # stem tokens
		    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
		    # add tokens to list
		    texts_.append(stemmed_tokens)

		# turn our tokenized documents into a id <-> term dictionary
		dictionary = corpora.Dictionary(texts_)

		# convert tokenized documents into a document-term matrix
		corpus = [dictionary.doc2bow(text) for text in texts_]

		# generate LDA model
		ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=n_topics, id2word = dictionary, passes=n_passes)

		return(ldamodel)
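A hedged usage sketch, assuming gensim, stop_words and nltk are installed and that createLDAModel is reachable at module level (the sample texts below are made up):

docs = [
    "The economy grew faster than expected this quarter",
    "Markets rallied on the latest economic figures",
    "The new movie was a surprise box office hit",
]
model = createLDAModel(docs, n_topics=2, n_passes=10)
for topic in model.print_topics(num_topics=2, num_words=3):
    print(topic)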
def lda(data):
	data = get_only_text(data)
	only_tweet = data
	length = len(only_tweet)
	length = min(20,length)
	for i in xrange(0,length):
		print i
		print only_tweet[i]
	return  # early return: the LDA code below is unreachable as written
	
	tokenizer = RegexpTokenizer(r'\w+')
	en_stop = get_stop_words('en')
	p_stemmer = PorterStemmer()

	length = len(only_tweet)
	length = min(20,length)
	total_texts = []
	for i in xrange(0,length):
		print only_tweet[i]
		print 
		to_lower = only_tweet[i].lower()
		tokens = tokenizer.tokenize(to_lower)
		stopped_tokens = [k for k in tokens if not k in en_stop]
		texts = [p_stemmer.stem(k) for k in stopped_tokens]
		total_texts.append(texts)

	dictionary = corpora.Dictionary(total_texts)
	corpus = [dictionary.doc2bow(text) for text in total_texts]

	ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)
	result =  ldamodel.print_topics(num_topics=2, num_words=1)
	for i in result:
		print i
Example #8
def generate_stemmed_tokens(page_content):
    lowered = page_content.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(lowered)
    stems = create_stems(tokens)

    return stems
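create_stems is not shown in this excerpt; a hedged guess at a minimal Porter-based helper it could correspond to:

from nltk.stem.porter import PorterStemmer

def create_stems(tokens):
    # hypothetical stand-in for the helper used by generate_stemmed_tokens above
    stemmer = PorterStemmer()
    return [stemmer.stem(t) for t in tokens]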
Example #9
    def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        @param rtepair: a L{RTEPair} from which features should be extracted
        @param stop: if C{True}, stopwords are thrown away.
        @type stop: C{bool}
        """
        self.stop = stop
        self.stopwords = set(
            ["a", "the", "it", "they", "of", "in", "to", "have", "is", "are", "were", "and", "very", ".", ","]
        )

        self.negwords = set(["no", "not", "never", "failed" "rejected", "denied"])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer

        tokenizer = RegexpTokenizer("([A-Z]\.)+|\w+|\$[\d\.]+")

        # Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set([lemmatize(token) for token in self.text_tokens])
            self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
Example #10
def get_structuredsteps(soup, dct):
    dct['structuredsteps'] = []
    new_steps = dct['steps']
    new_ingredients = dct['ingredients']
    tokenizer = RegexpTokenizer(r'\w+')

    time_units = ['min', 'min.', 'minutes', 'minute', 'hour', 'hours', 'hr', 'hrs', 'hr.', 'hrs.']

    ingredient_names = []
    for y in new_ingredients:
        ingredient_names.append(y['name'])

    for step in new_steps:
        if step != '':
            method_list = []
            for method in methods:
                if method in step:
                    method_list.append(method)
                elif method + "ing" in step:
                    method_list.append(method)
                elif method + "s" == step:
                    method_list.append(method)
                elif method + "er" == step:
                    method_list.append(method)
                elif method + "ed" == step:
                    method_list.append(method)
                elif method + "ing" == step:
                    method_list.append(method)
            tools_list = []
            for tool in tools:
                if tool in step:
                    tools_list.append(tool)

            for verb in actions:
                if verb in step:
                    tools_list.append(actions[verb])
            ingredient_list = []
            for x in ingredient_names:
                for y in x.split():
                    if y in step:
                        ingredient_list.append(x)

            cooking_time = " "
            step_list = tokenizer.tokenize(step)

            for x in range(0,len(step_list)-2):
                if step_list[x].isdigit():
                    if step_list[x+1] in time_units:
                        cooking_time += step_list[x]+ ' ' + step_list[x+1] + ' '
            d = {
                'step': step,
                'tools': set(tools_list),
                'methods' : set(method_list),
                'cooking time': cooking_time,
                'ingredients' : ingredient_list
            }
            dct["structuredsteps"].append(d)
def Tokenize(TextData):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = list()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # clean and tokenize document string
    raw = TextData.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    tokens = stemmed_tokens

    TOKENIZEDTEXT_FILE = path.join(os.pardir, "Resources/TokenizedTextFiles/Personal-Narration/Unbroken - Motivational Video.txt")
    fp = open(TOKENIZEDTEXT_FILE, "w")
    print(TOKENIZEDTEXT_FILE)
    # pickle.dump(tokens, fp)
    fp.write(str(tokens))
    fp.close()
Example #12
def extractWords(text):
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    sWords = stopwords.words("english")

    return [w.lower() for w in words if w not in sWords]
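Usage sketch, assuming the NLTK stopwords corpus has been downloaded:

print(extractWords("The cat sat on the mat"))
# -> ['the', 'cat', 'sat', 'mat']; the capitalised "The" survives the stopword
#    filter because the membership check runs before lowercasing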
Example #13
def text_process(text):
    '''
    Takes in a string of text, then performs the following
    1. Tokenizes and removes punctuation
    2. Removes stopwords
    3. Stems
    4. Returns the cleaned text re-joined into a single string
    '''
    if(pd.isnull(text)):
        return []
    
    # Tokenize 
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed = tokenizer.tokenize(text)
    
    # Removing any stopwords
    text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
    
    # Stemming
    porterStemmer = PorterStemmer()
    
    text_processed = [porterStemmer.stem(word) for word in text_processed]
    
    try:
        # drop a stray 'b' token if present
        text_processed.remove('b')
    except ValueError:
        pass
    
    return " ".join(text_processed)
Example #14
    def trainMarkovChain(self, n = 1):

        self.ngram_degree = n
      
        self.markov_model = defaultdict(lambda : defaultdict(int))

        sentences = self.corpus_sentences
        if sentences is None:
            sentences = self.sentenceTokenizeCorpus()

        print("Training markov model on corpus.")

        word_tokenizer = RegexpTokenizer(r"\w+")

        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            last_word_list = ["#"] * n

            for word in words:
                last_token = " ".join(last_word_list)
                
                self.markov_model[last_token][word] += 1
                
                last_word_list.append(word)
                last_word_list = last_word_list[1:]

            last_token = " ".join(last_word_list)
            self.markov_model[last_token]["#"] += 1
 def stripped_words(self, original_sentence):
     _sentence = filter(self.printable_char_filter, original_sentence)
     _sentence = _sentence.replace(u'\u2013', ' ')
     _sentence = _sentence.replace(u'\u2014', ' ')
     tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
     tokens = tokenizer.tokenize(_sentence)
     return [word.lower() for word in tokens if word.lower() not in stop_words]
Example #16
def write_summary(texts, ofile):
    word_tokenizer = RegexpTokenizer(r"\w+")
    with codecs.open(ofile, u"w", u"utf-8") as f:
        for text in texts:
            f.write(u" ".join([w.lower() for w in word_tokenizer.tokenize(text)]))
            f.write(u"\n")
            f.flush()
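Usage sketch (the output path is made up):

write_summary(["First sentence here.", "Another, longer sentence!"], "summaries.txt")
# writes one line per text: lower-cased word tokens joined by single spaces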
def relevance_features(doc):
	print "relfeatures"
	print doc[:10]
	features={}
	#print doc
	#Test 1 : Has synonyms of  NIT Warangal
	features['contains synonym']='false'
	for word in synonyms:
		if word in doc:
			features['contains synonym']='true'
			break

	#Test 2 : Has a person name that appears in Almabase's DB
	count=0
	names=ner.get_names(data)
	count=ner.query_db(names)
	print 'count is {}'.format(count)

	# if count==0:
	# 	features['hasAlumnus']='none'
	# elif count<=3:
	# 	features['hasAlumnus']='medium'
	# elif count>3:
	# 	features['hasAlumnus']='high'
	# print count

	#Test 3: Bag of words approach
	tokenizer = RegexpTokenizer(r'\w+')
	document_words=tokenizer.tokenize(doc)
	for word in word_features:
		if word.lower() in document_words:
			print "{} is present".format(word)
		features['contains({})'.format(word.lower())] = (word.lower() in document_words)
	return features
def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)

    Issue_List=[]
    for i in range(0,50):
        Issue_List.append(df_sub.groupby(['Issue']).sum().sort_values(by='count', ascending=False).iloc[i].name)

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')    # set tokenize Reg
    en_stop = get_stop_words('en')         # create English stop words list
    p_stemmer = PorterStemmer()            # Create p_stemmer of class PorterStemmer
    texts = []                             # list for tokenized documents in loop
    text_view = ''
                                                                
    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
       
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        
        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)

        #print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8,6))
    fig1 = fig.add_subplot(1,1,1)
    fig1.set_title("Top issued words", fontdict={'fontsize':25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')
    
    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word = dictionary)
    LDAText =  ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)
       
    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")

    return 0
def preprocess_wikidata(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # drop one- and two-character tokens
    tokens = [i for i in tokens if len(i)>2]

    return (tokens, text)
def get_product_vocab(dict_queries):
    tok = RegexpTokenizer(r'\w+')
    vocab = {}

    for query,v in dict_queries.items():
        words = defaultdict(int)

        for prod in v:
            w_prod = tok.tokenize(prod[1])
            for w in w_prod:
                #wt = stem(wt)
                if not re.match(r'\d+$', w) and \
                    len(w) > 1 and \
                    w not in stop_words: 
                    words[w] += 1

        vocab[query] = words.keys()
        #vocab[query] = [k for (k, v) in words.iteritems() if v > 1]

        """
        print "Query: " + query
        sorted_w = sorted(words.items(), key=lambda x:x[1], reverse=True)
        print sorted_w
        """
    
    return vocab
def preprocess(TWEETS, typeTweet):
    wordlist = []
    tokenizer = RegexpTokenizer(r'#?\w+') 
    #normalize text -- TOKENIZE USING REGEX TOKENIZER
    cnt = 0
    for item in TWEETS:
        text = TWEETS[cnt]
        tweet = ''.join(text)
        tweet = tweet.lower().strip('\n')
        
        tweet = re.sub(r'[0-9]+', "" , tweet)
        tweet = re.sub(r'@[^\s]+', "" , tweet)
        tweet = re.sub(r'#\w+primary', "" , tweet)                    
        wordlist.extend(tokenizer.tokenize(tweet))
        cnt += 1

    #remove stopwords
    stop = stopwords.words('english') + ['rt', 'via', 'u', 'r', 'b', '2', 'http', 
                                        'https', 'co', 'live', 'hall', 'town', 'watch', 
                                        'tune', 'time', 'tonight', 'today', 'campaign', 
                                        'debate', 'wants', 'without', 'dont', 
                                        '#hillaryclinton', '#berniesanders', '#donaldtrump', 
                                        '#tedcruz', "#johnkasich", '#politics']
    filtered = [term for term in wordlist if term not in stop] 
    filtered_final = [term for term in filtered if len(term)>3] 
    print 'Preprocessed %s tweets' % (typeTweet)
    return filtered_final
def getData():
    tokenizer = RegexpTokenizer(r'\w+')
    f = open("msr_paraphrase_train.txt", "r")
    f.readline()
    trainInput = []
    trainClass = [0] * 8160
    i = 0
    while i < 8160:
        tokens = f.readline().strip().split('\t')
        trainClass[i] = trainClass[i+1] = int(tokens[0])
        i += 2
        S = tokenizer.tokenize(tokens[3].lower())
        Smatrix1 = sentenceToMatrix(S)
        S = tokenizer.tokenize(tokens[4].lower())
        Smatrix2 = sentenceToMatrix(S)
        trainInput.append([np.transpose(Smatrix1+Smatrix2)])
        trainInput.append([np.transpose(Smatrix2+Smatrix1)])

    f.close()

    f = open("msr_paraphrase_test.txt", "r")
    f.readline()
    testInput = []
    testClass = [0] * 1725
    for i in range(0,1725):
        tokens = f.readline().strip().split('\t')
        testClass[i] = int(tokens[0])
        S = tokenizer.tokenize(tokens[3].lower())
        Smatrix = sentenceToMatrix(S)
        S = tokenizer.tokenize(tokens[4].lower())
        Smatrix.extend(sentenceToMatrix(S))
        testInput.append([np.transpose(Smatrix)])

    f.close()
    return trainInput, trainClass, testInput, testClass
Example #23
	def __init__(self, oldid, newid, data, general):
		self.newid=newid
		self.oldid=oldid
		self.data=data
		self.tfidfatt=[]
		self.tfidfval=[]
		self.freatt=[]
		self.freval=[]
		self.text=''
		self.ntlk=[]
		self.idfvalue=[]
		self.general=general

		tokenizer = RegexpTokenizer(r'\w+')
		#stemmer = SnowballStemmer("english")
		stemmer = PorterStemmer()

		stop = stopwords.words('english')
		for r in tokenizer.tokenize(data):
			if r not in stop:
				if not any(i.isdigit() for i in r):
					r = stemmer.stem(r)
					if r not in self.ntlk:
						self.ntlk.append(r)
						self.text=self.text+' '+r
Example #24
	def parse_raw_data(self, new_art):
		self.startClass=default_timer()
		tokenizer = RegexpTokenizer(r'\w+')
		tokens = tokenizer.tokenize(new_art.body)
		stemmer = LancasterStemmer()
		article_dic = new_art.words
		global_dic = self.raw_dictionary

		for word in tokens:
			word = word.lower()
			if(False == self.is_stop_word(word) and word.isnumeric()==False):
				s_word = stemmer.stem(word)

			#	s_word = word
			## it is not a stop word, check if the word
			## is already part of the article dictionary.
			## if yes, increment the count else add it.
			## If you are adding check if it is part of
			## the big corpus, if yes increment the count
			## of number of articles with that word.
				self.globalWordCount+=1
				new_art.doc_len = new_art.doc_len + 1
				if(s_word in article_dic):
					article_dic[s_word].wrd_count+=1
					global_dic[s_word].wrd_count+=1
				else:
					article_dic[s_word] = local_word_attributes(1)

					if (s_word in global_dic):
						global_dic[s_word].art_count+=1
						global_dic[s_word].wrd_count+=1
					else:
						global_dic[s_word] = global_word_attributes(1,1, 1, 0)
Example #25
	def mean_stdDeviation(self,query,stopWordInstruction):
		list_count_postTitles = []
		list_postTitles = self.data[:][query].tolist()
		tokenizer = RegexpTokenizer(r'\w+')

		stopwords_mine = []
		#a.encode('ascii','ignore')
		stopwords_mine+= (word.encode('ascii','ignore') for word in stopwords.words('english'))
		tokenized_list = []
		new_list_tokenized = []
		for item in list_postTitles:
			tokenized_list.append(tokenizer.tokenize(item))
		
		if stopWordInstruction==True:
			for item in tokenized_list:
				temp = []
				temp += (word for word in item if word.lower() not in stopwords_mine)
				#print temp
				#raw_input()
				new_list_tokenized.append(temp)
		else:
			new_list_tokenized=copy.deepcopy(tokenized_list)
		


		for x in new_list_tokenized:
			list_count_postTitles.append(len(x))
		#print list_count_postTitles
		npArray = np.asarray(list_count_postTitles)
		print npArray.mean()
		print npArray.std()
		return [npArray.mean(),npArray.std(),list_postTitles,list_count_postTitles]
Example #26
def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if not w in stopwords.words('english')]
    #filtered_words = filter(lambda token: token not in stopwords.words('english'))
    return " ".join(filtered_words)
def count_ngrams(sessions,length):
    data = sessions
    data = data.replace(',',' ')
    

    tokenizer = RegexpTokenizer("[0-9]+")
    #include only number (pageIDs) for tokens
    token = tokenizer.tokenize(data)
    from nltk.util import ngrams
    #print list(ngrams(token, 2))

    generated_ngrams = list(ngrams(token,length))
    #print generated_ngrams
    try:
        ngrams = ' '.join(generated_ngrams[0])
    except IndexError:
        global non_list 
        non_list += 1
        #print 'Failed generated ngrams as there is no minimum '    
   # print ngrams
 
    for ngram in generated_ngrams:
        if not ngrams_statistics.has_key(ngram):
            ngrams_statistics.update({ngram:1})
        else:
            ngram_occurrences = ngrams_statistics[ngram]
            ngrams_statistics.update({ngram:ngram_occurrences+1})      
Example #28
 def run(self, data):
     results = []
     tokenizer = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)
     for corpus in data:
         corpus.contents = " ".join(tokenizer.tokenize(corpus.contents))
         results.append(corpus)
     return results
Example #29
    def tokenize(self, doc):
        '''
        use NLTK RegexpTokenizer
        '''

        tokenizer = RegexpTokenizer("\w{3,}")
        return [self.stemmer.stem(x) for x in tokenizer.tokenize(doc)]
Example #30
    def parse_questions(self):
        stemmer = PorterStemmer()
        tokenizer = RegexpTokenizer(r'\w+')
        for questions_key in self.rawSamples:
            # Stem the Question Text
            question_text = self.rawSamples[questions_key][0]
            words_array = tokenizer.tokenize(question_text)
            question_text = ""
            for word in words_array:
                if word.isnumeric():
                    continue
                if word not in text.ENGLISH_STOP_WORDS:
                    word = stemmer.stem(word)
                question_text += (word + " ")
            self.rawSamples[questions_key][0] = question_text

            # Stem the topic names
            topics_text = self.rawSamples[questions_key][2]
            words_array = tokenizer.tokenize(topics_text)
            topics_text = ""
            for word in words_array:
                if word.isnumeric():
                    continue
                if word not in text.ENGLISH_STOP_WORDS:
                    word = stemmer.stem(word)
                topics_text += (word + " ")
            self.rawSamples[questions_key][2] = topics_text
Example #31
Mithilesh
Ganesh Shinde
Sachidanand Tripathi
@author: Ganesh
"""

import nltk
# from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

with open(r"InputDataForNgram") as inputData:
    data = inputData.read()

dataWithoutPun = RegexpTokenizer(r'\w+')
token = dataWithoutPun.tokenize(data)


def ngramfunction(token, number):
    totCount = len(token)

    ngramlist = ngrams(token, number)
    fdist = nltk.FreqDist(ngramlist)

    # FreqUnigramData = list(fdist.keys())
    # unigramResult = FreqUnigramData[0:100]

    # print(unigramResult)
    # fdist.plot(50,cumulative=False)
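The visible excerpt of ngramfunction builds a FreqDist but does not show a return; for reference, a hedged standalone sketch of the same idea (frequency-ranked n-grams over RegexpTokenizer output) that returns the top n-grams:

import nltk
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer

def top_ngrams(text, n=2, k=10):
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    return nltk.FreqDist(ngrams(tokens, n)).most_common(k)

# top_ngrams("to be or not to be", n=2) -> [(('to', 'be'), 2), ...]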
Example #32
# STOPWORDS = maketrie(stopwords.words('english'))
# STOPWORDS = set(stopwords.words('english'))
CORPUSDIR = './presidential_debates'
# DFTS is a dict {'token': int(doc_freq)} that maps tokens to the qty of docs
# containing that term (ie NOT IDF, just the qty of docs containing the token)
DFTS = Counter()  # PriorityQueue()
DOCS = {}
FILENAMES = ''
IDFS = Counter()
N = float(0)
STEMMER = PorterStemmer().stem
STOP = 'STOP'
STOPWORDS = stopwords.words('english')
# TFIDFS = Counter()
TFIDFS = defaultdict(lambda: Counter())
TOKENIZER = RegexpTokenizer(r'[a-zA-Z]+').tokenize
# Can't DFTS.keys() be used instead of TOKENCORPUS??
# TOKENCORPUS =


def setup():
    global FILENAMES
    global IDFS
    global N
    FILENAMES = listdir(CORPUSDIR)
    N = float(len(FILENAMES))
    t = time()
    with Pool() as p:
        DOCS = dict(p.map(process_document, FILENAMES))
        for tokens in DOCS.values():
            # TOKENCORPUS.update(tokens)
        evaluation['#2 Rating'].append(0)
        evaluation['#3 Simplification'].append(unicode(baseline_result[key], 'ascii', errors='ignore'))
        evaluation['#3 Rating'].append(0)
        evaluation['#4 Simplification'].append(unicode(clustered_distances[key], 'ascii', errors='ignore'))
        evaluation['#4 Rating'].append(0)
        num_eval_sentences = num_eval_sentences + 1
        indeces_eval_sentences.append(random)

    
eval_df = pd.DataFrame(evaluation, index=[i for i in range(0, len(evaluation['Annotated Sentence']))], columns=['Annotated Sentence', '#1 Simplification', '#1 Rating', '#2 Simplification', '#2 Rating', '#3 Simplification', '#3 Rating', '#4 Simplification', '#4 Rating'])
eval_df.to_html(output_eval_dir, index=False, escape=False)

"""


tokenizer = RegexpTokenizer(r'\w+')
for i in range(0, len(dictionary)):
    flag = False
    for original in output['Annotated Sentence']:
        if tokenizer.tokenize(dictionary[i]['value']) == tokenizer.tokenize(get_original(original)):
            dictionary[i]['simplification'] = result[original][score[original].index(max(score[original]))]
            flag = True
            break
    if flag is False:
        print dictionary[i]['annotated_sentence']


xml_dict = []
for i in range(0 , len(dictionary)):
    xml_dict.append({'annotated_sentence': dictionary[i]['annotated_sentence'], \
                     'value': dictionary[i]['value'], \
Example #34
# coding=utf-8
import unidecode
import inflection
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def change_alphabet(sent):
    return unidecode.unidecode(sent.decode('utf-8'))


def clean_sent(sent):
    sent = re.sub(r"http\S+", "", sent.lower()).decode('utf-8')
    sent = re.sub(r"@\S+", "", sent.lower()).decode('utf-8')
    #words=sent.split(" ")
    words = tokenizer.tokenize(sent)
    words_refined = [
        lemmatizer.lemmatize(inflection.singularize(word)) for word in words
    ]
    words = [
        inflection.transliterate(word.decode('utf-8'))
        for word in words_refined if not word.isdigit() and len(word) > 2
    ]
    p_stemmer = PorterStemmer()
    _digits = re.compile('\d')
def init():
	ps = PorterStemmer()
	tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
	return ps,tokenizer
Example #36
def my_tokenizer(text: str):
    """ Return tokens, remove punctuations as well"""
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(text)
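Usage sketch:

print(my_tokenizer("Hello, world! It's 2020."))
# -> ['Hello', 'world', 'It', 's', '2020']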
Example #37

def process_word(cont):
    c = []
    for i in cont:
        i = i.lower()
        clean_tweet = re.sub(r"http\S+", "", i)
        tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
        clean_tweet = tokenizer.tokenize(clean_tweet)
        a = list(clean_tweet)
        b = " ".join(a)
        c.append(b)
    return c


tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
train = readtrain()
data_train = process_word(train[1])
all_data = []


def cout_word():
    result = {}
    for content in data_train:
        for word in content.split():
            if word not in result:
                result[word] = 0
            result[word] += 1
    return result

reasonsList = list(reasons["Response"])
suggestionsList = list(suggestions["Response"])

#########################################################################################
#Part-of-Speech (POS) tag the words, then lemmatize (make root word) them               #
#########################################################################################
"""We may not actually want to lemmatize, since we will be making bigrams. Check """

from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')


#Function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
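A hedged sketch of how this tag mapping is typically combined with pos_tag and the lemmatizer defined above (requires NLTK's 'averaged_perceptron_tagger' and 'wordnet' data; lemmatize_sentence is not part of the original script):

def lemmatize_sentence(sentence):
    tokens = tokenizer.tokenize(sentence)
    lemmas = []
    for word, tag in pos_tag(tokens):
        wn_tag = nltk_tag_to_wordnet_tag(tag)
        if wn_tag is None:
            lemmas.append(word)                       # leave untaggable words as-is
        else:
            lemmas.append(lemmatizer.lemmatize(word, wn_tag))
    return lemmas

# e.g. 'bats' -> 'bat', 'feet' -> 'foot', 'were' -> 'be'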
class Text_Preprocessing():
    def __init__(self, doc_map):
        self.posting_list = {}
        self.mine = ['br','\'','http','url','web','www','blp','ref','external','links']
        self.stop_words = set(stopwords.words('english')).union(self.mine)
        # self.ps = PorterStemmer().stem
        self.ps = SnowballStemmer("english").stem

        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+|[0-9]{,4}')
        self.d = doc_map
        self.sent = nltk.data.load('tokenizers/punkt/english.pickle').tokenize         
        self.toktok = ToktokTokenizer()
    
    def check(self, t1, t2, t3):
        
        if t1 not in self.posting_list:
            self.posting_list[t1] = {}
    
        if t2 not in self.posting_list[t1]:
            self.posting_list[t1][t2] = {}
            
        if t3 not in self.posting_list[t1][t2]:
            self.posting_list[t1][t2][t3] = 0
        return self.posting_list
    
    def process_title(self, text, pageNumber):
        

        token_list = self.tokenizer.tokenize(text.lower())    
        token_list = list(filter(None, token_list)) 

        filtered_sentence = [w for w in token_list if not w in self.stop_words]
        stemmed_list = [self.ps(word) for word in filtered_sentence if len(word)<11]
        stemmed_list = list(filter(None, stemmed_list))         
        # print('stemmedList title: ',stemmed_list)
        for word in stemmed_list:
            
            self.posting_list = self.check(word, pageNumber, 't')
            self.posting_list = self.check(word, pageNumber, 'n')
            self.posting_list[word][pageNumber]['t'] += 1
            self.posting_list[word][pageNumber]['n'] += 1
  
    def process_categories(self,text, pageNumber):
        c = 0
        category_regex = compile(".*\[\[Category:(.*?)\]\].*")
        match_cat_list = category_regex.findall(text)
        total_stems = []
        n = len('category') + 4
        rem = '[[Category:%s]]'
        extend = total_stems.extend
        for one_match in match_cat_list[:4]:
        
            text = text.replace(rem%(one_match), '')
            category_name = one_match[n:-3] # say, Indian Culture
            category_name = category_name.lower()
            token_list = self.tokenizer.tokenize(category_name)
            token_list = list(filter(None, token_list)) 

            filtered_sentence = [w for w in token_list if not w in self.stop_words]
            stemmed_list = [self.ps(word) for word in filtered_sentence if len(word)<11]
            extend(stemmed_list)
            
        
        for word in total_stems: # ['data', 'scienc', 'peopl', 'birth']
            # if word == '':
            #     print('here null category')
            self.posting_list = self.check(word, pageNumber, 'c')
            self.posting_list = self.check(word, pageNumber, 'n')
            self.posting_list[word][pageNumber]['c'] += 1
            self.posting_list[word][pageNumber]['n'] += 1
        
        return text
    
    def process_infobox(self, text, pageNumber):    

        infobox_start = compile("{{Infobox")

        start_match = search(infobox_start, text)
        if start_match:

            start_pos = start_match.start()
            brack_count = 2
            end_pos = start_pos + len("{{Infobox ")
            while(end_pos < len(text)):
                if text[end_pos] == '}':
                    brack_count = brack_count - 1
                if text[end_pos] == '{':
                    brack_count = brack_count + 1
                if brack_count == 0:
                    break
                end_pos = end_pos+1

            if end_pos+1 >= len(text):
                return
            infobox_string = text[start_pos:end_pos+1]  
            text = text.replace(infobox_string, '')
            content = infobox_string.split('\n')
            content = list(map(lambda x:x.lower(),content))
            tokens = []
            add = tokens.append
            heading = content[0][len('{{infobox '):-1]
            add(heading)
            for idx in range(1,len(content)-2):
                try:
                    value = " ".join(findall(r'\w+', content[idx].split('=',1)[1])).strip()
                    add(value)
                except:
                    pass
            tokens = list(filter(lambda x: x.strip(), tokens))
            tokens = list(filter(None, tokens)) 
            total_stems = []
            extend = total_stems.extend
            for one_token in tokens:
                token_list = self.tokenizer.tokenize(one_token)
                filtered_sentence = [w for w in token_list if not w in self.stop_words]
                stemmed_list = [self.ps(word) for word in filtered_sentence if len(word)<11]
                extend(stemmed_list)
            total_stems = list(filter(None, total_stems)) 
            for word in total_stems:
                # if word == '':
                #     print('here null ibox; ', total_stems)
                self.posting_list = self.check(word, pageNumber, 'i')
                self.posting_list = self.check(word, pageNumber, 'n')
                self.posting_list[word][pageNumber]['i'] += 1
                self.posting_list[word][pageNumber]['n'] += 1
        return text

    def process_ref(self, text, pageNumber):
#             pass
            ref_start = compile('< ref.* >(.*?)< /ref >', DOTALL)
            title_start = compile('.*title =|.*title=')
            n=2
            tokenized_corpus = [ref_start.findall(sent) for sent in sent_tokenize(text) if len(ref_start.findall(sent))>0  ]
            tokenized_corpus = list(chain(*tokenized_corpus))
            if len(tokenized_corpus) > n:
                tokenized_corpus = tokenized_corpus[:n]
            total_stems = []
            extend = total_stems.extend
            # print('ref len %f'%len(tokenized_corpus))
            for match_list in tokenized_corpus:
                text = text.replace(match_list, '')
                pipe_tokens = match_list.split('|')
                for one_token in pipe_tokens:

                    if title_start.match(one_token):

                        title = one_token.split('=')[1]
                        token_list = self.tokenizer.tokenize(one_token)
                        filtered_sentence = [w.lower() for w in token_list if not w in self.stop_words]
                        stemmed_list = [self.ps(word) for word in filtered_sentence]
                        stemmed_list = list(filter(None, stemmed_list)) 
                        extend(stemmed_list)
            
            for word in total_stems:
                self.posting_list = self.check(word, pageNumber, 'r')
                self.posting_list = self.check(word, pageNumber, 'n')
                self.posting_list[word][pageNumber]['r'] += 1
                self.posting_list[word][pageNumber]['n'] += 1
    
    def process_body_text(self, text, pageNumber):
        
        body_ = compile(r'==(.*)==|{{(.*)}}|#(.*)|{{(.*)|{{(.*)|\|(.*)|\}\}|\*.*|!.*|\[\[|\]\]|;.*|&lt;.*&gt;.*&lt;/.*&gt;|<.*>.*</.*>|<.*>')
        matches = list(chain.from_iterable(body_.findall(text)))

        matches = list(filter(None, matches)) 
        # text = filter(lambda x: text.replace(x,''), matches )
        big_regex = compile('|'.join(map(escape, matches)))
        text = big_regex.sub('',text)
        
        
        content = text.splitlines()
        content = list(filter(lambda x: x.strip(), content))

        content = [" ".join(findall("[a-zA-Z]+", x)).strip() for x in content]
        content = list(filter(None, content)) 
        
        content = list(map(lambda x:x.lower(),content))
        
        total_stems = []
        extend = total_stems.extend
        if len(content)>200:
            for one_line in range(0,len(content),5):
                   
                token_list = word_tokenize(content[one_line])
                filtered_sentence = [w for w in token_list if not w in self.stop_words]
                stemmed_list = [self.ps(word) for word in filtered_sentence]
                extend(stemmed_list)
        else:
            for one_line in content:
                   
                token_list = word_tokenize(one_line)
                filtered_sentence = [w for w in token_list if not w in self.stop_words]
                stemmed_list = [self.ps(word) for word in filtered_sentence]
                extend(stemmed_list)
        
        for word in total_stems:
            # if word == '':
                # print('here null boy')
            self.posting_list = self.check(word, pageNumber, 'b')
            self.posting_list = self.check(word, pageNumber, 'n')
            self.posting_list[word][pageNumber]['b'] += 1
            self.posting_list[word][pageNumber]['n'] += 1
        return text

#     def process_ref(self, text, pageNumber):
# #             pass
#             ref_regex = compile('.*< ref (.*?)< /ref >.*',DOTALL)
#             ref_tag = ref_regex.findall(text)
#             i = 0
            
#                 # title_start = compile('(.*?)title =|(.*?)title=')
#             for r in ref_tag:

#                 try:
#                     i+=1
#                     if i==4:
#                         break
#                     text = text.replace('< ref '+r+'< /ref >', '')
#                     r = split(r'title',r)[1].split('|',1)[0].replace('=','').strip()
                    
#                     token_list = self.tokenizer.tokenize(r)

#                     filtered_sentence = [w.lower() for w in token_list if not w in self.stop_words]

#                     stemmed_list = [self.ps(word) for word in filtered_sentence if len(word)<11]
#                     extend(stemmed_list)

#                     for word in total_stems:
#                         self.posting_list = self.check(word, pageNumber, 'r')
#                         self.posting_list = self.check(word, pageNumber, 'n')
#                         self.posting_list[word][pageNumber]['r'] += 1
#                         self.posting_list[word][pageNumber]['n'] += 1
#                 except:
#                     pass
                    
        
#     # def ab_with_check(self,text):
#     #     for ch in ['\\','`','*','_','{','}','[',']','(',')','>','#','+','-','.','!','$','\'']:
#     #         if ch in text:
#     #             text = text.replace(ch,"\\"+ch)
    
#     def process_body_text(self, text, pageNumber):
        
#         body_ = compile('==.*==|\{\{.*\}\}|#.*|\{\{.*|\|.*|\}\}|\*.*|!.*|\[\[|\]\]|;.*|&lt;.*&gt;.*&lt;/.*&gt;|<.*>.*</.*>|<.*>')
#         matches = set(body_.findall(text))
#         # print(matches)
#         # text = text.replace(x,'') for x in matches
#         # print(text)

#         if matches:
            
#             for one_match in matches:
#                 # print('one_match: ',one_match)
#                 text = text.replace(one_match,'')
#                 # print('text: ',text)
        
#         # text = str(filter(lambda x: text.replace(x,''), matches ))
#         content = text.splitlines()
#         content =[" ".join(findall("[a-zA-Z]+", x)).strip().lower() for x in content]
#         # content = [x.strip() for x in content]

#         # content = [" ".join(findall("[a-zA-Z]+", x)).strip() for x in content]
#         content = list(filter(None, content)) 
#         # print(content)

#         # content = list(map(lambda x:x.lower(),content))
#         # # content = " ".join(content)
#         # # print(content)s
#         total_stems = []
#         extend = total_stems.extend
#         if len(content)>200:
            
#             for one_word in range(0,len(content),2):
                   
#                 token_list = self.tokenizer.tokenize(content[one_word])
#                 filtered_sentence = [w for w in token_list if not w in self.stop_words]
#                 stemmed_list = [self.ps(word) for word in filtered_sentence if len(word)<20]
#                 extend(stemmed_list)
#         else:
#             for one_word in content:
        
#                 token_list = self.tokenizer.tokenize(one_word)
#                 filtered_sentence = [w for w in token_list if not w in self.stop_words]
#                 stemmed_list = [self.ps(word) for word in filtered_sentence if len(word)<20]
#                 extend(stemmed_list)

#         for word in total_stems:
#             self.posting_list = self.check(word, pageNumber, 'b')
#             self.posting_list = self.check(word, pageNumber, 'n')
#             self.posting_list[word][pageNumber]['b'] += 1
#             self.posting_list[word][pageNumber]['n'] += 1
#         return text
    
    def make_index(self):
        limit_one_doc = 30/60000.0 # in sec
        # print('make index')
        title_regex = compile('.*?:')
        for k,v in self.d.items():
            
            t1,t2,t3,t4,t5=0,0,0,0,0
            t = time.time() 
            match_title = title_regex.match(v['title'])
            self.process_title(v['title'], v['id'])            
            t1= time.time()-t
            if not match_title:
                body = v['body']
                t = time.time()
                x = self.process_categories(body, v['id'])
                t2= time.time()-t
                t= time.time()
                x = self.process_infobox(x, v['id'])
                t3= time.time()-t
                if x is not None:
                    # t = time.time()
                    self.process_ref(x, v['id'])
                t4=0
                if x is not None:
                    t = time.time()
                    
                    x = self.process_body_text(x, v['id'])
                    t5= time.time()-t
            T = t1+t2+t3+t4+t5
            if T>=limit_one_doc:
                pass
                # print('id %s title %f cat %f infobox %f ref %f body %f' % (v['id'],t1,t2,t3,t4,t5))
                # print('--> T: %f limit: %f exceed: %f'%(T, limit_one_doc, T-limit_one_doc))
            # print(i,end=' ')
                    
        return

    def parse_posting_list(self, path2index):
        complete_index = dict(sorted(self.posting_list.items()))

        for term, posting_list in complete_index.items():
            # if s:
            #     print('term: ',term)
            # if term == '':
            #     print()
            one_line = ""
            one_line = term + "|"
            for doc_id, occurences in posting_list.items(): 
                one_line += str(doc_id) + "$"
                
                for field, count in occurences.items():
                    one_line += field + ":" + str(count) + "#"
                    
                one_line += "|"
            one_line += "\n"
            with open(path2index, 'a+') as i:
                i.write(one_line)
    #one line: 0|29$i:1#n:1#|61$i:1#n:1#|..