Code example #1
    def categorize_input_query(self, input_query):
        query_category = OrderedDict()

        input_query = self.replace_punctuation_in_query_string(input_query)
        phrasal_not_tokenizer = RegexpTokenizer(r'![\"]+(\w+[-]*(\w+)*(\s*)(\w)*)*[\"]')
        word_not_tokenizer = RegexpTokenizer(r'!(\w+[-]*(\w)*)')

        not_queries_set = set(word_not_tokenizer.tokenize(input_query))
        not_queries_set = not_queries_set.union(set(phrasal_not_tokenizer.tokenize(input_query)))
        string_copy = input_query
        string_copy = re.sub(r"\".*?\"", "", string_copy)
        string_copy = re.sub(r"!.*?(\s|$)", "", string_copy)

        modified_not_words = []
        for words in not_queries_set:
            # strip the leading '!' from each NOT term
            modified_not_words.append(words[1:])
        phrase_tokenizer = RegexpTokenizer(r'[\"]+(\w+[-]*(\w+)*(\s*)(\w)*)*[\"]')
        phrase_queries_set = set(phrase_tokenizer.tokenize(input_query))

        phrase_queries_set = phrase_queries_set.difference(set(modified_not_words))
        query_category["PHRASE"] = phrase_queries_set
        query_category["NOT"] = modified_not_words
        normal_words = string_copy.split()
        normal_word_set = set(normal_words)
        query_category["WORD"] = normal_word_set
        return query_category
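As a quick aside, the two re.sub calls above are what leave only the plain WORD terms behind: the first strips quoted phrases, the second strips !-prefixed terms. A minimal standard-library check of just that cleanup, on a made-up query string:

import re

query = 'apple !banana "kiwi fruit" cherry'
s = re.sub(r"\".*?\"", "", query)    # drop quoted phrases
s = re.sub(r"!.*?(\s|$)", "", s)     # drop !-negated terms
print(s.split())                     # ['apple', 'cherry']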
Code example #2
File: nlp.py Project: gsi-upm/sematch
class HashtagMatch:

    def __init__(self, name_matcher):
        from nltk.tokenize import RegexpTokenizer
        self._name_matcher = name_matcher
        self._hashtag_extract = RegexpTokenizer('(#[A-Za-z][A-Za-z0-9-_]+)')
        self._at_extract = RegexpTokenizer('(@[A-Za-z][A-Za-z0-9-_]+)')

    def extract_hashtag(self, text):
        return self._hashtag_extract.tokenize(text)

    def extract_at(self, text):
        return self._at_extract.tokenize(text)

    def match(self, text):
        segs = [' '.join(seg) for seg in self.segment(text[1:])]
        entities = map(self._name_matcher.exact_match, segs)
        return [e for e in entities if e]

    def segment(self, text):
        n = len(text) - 1
        count = 2 ** n
        sequences = map(lambda x: bin(x)[2:].zfill(n), range(count))
        segmentations = []
        for s in sequences:
            segmentation = []
            begin = 0
            for i in range(n):
                end = i + 1
                if s[i] == '1':
                    segmentation.append(''.join(text[begin:end]))
                    begin = end
            segmentation.append(''.join(text[begin:end + 1]))
            segmentations.append(segmentation)
        return segmentations
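The segment method above enumerates every way of cutting a hashtag body into contiguous pieces by treating each of the 2**(n-1) bit strings as a mask of cut positions. A minimal standalone sketch of the same idea (independent of the class; the function name and sample input are illustrative):

def all_segmentations(text):
    n = len(text) - 1
    results = []
    for mask in range(2 ** n):
        bits = bin(mask)[2:].zfill(n)       # one bit per gap between characters
        piece, pieces = text[0], []
        for i, ch in enumerate(text[1:]):
            if bits[i] == '1':              # cut here
                pieces.append(piece)
                piece = ch
            else:
                piece += ch
        pieces.append(piece)
        results.append(pieces)
    return results

print(all_segmentations("abc"))
# [['abc'], ['ab', 'c'], ['a', 'bc'], ['a', 'b', 'c']]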
Code example #3
    def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations, monetary amounts, email
        # addresses, URLs are single tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer('([\w.@:/])+|\w+|\$[\d.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
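The last three lines are the heart of this feature extractor: after stopword removal, comparing hypothesis and text reduces to plain set arithmetic. A tiny illustration with hand-made token sets (the words are invented, not taken from any RTE pair):

text_words = {"john", "bought", "a", "car", "in", "paris"}
hyp_words = {"john", "owns", "a", "car"}
stopwords = {"a", "the", "in"}

text_words -= stopwords
hyp_words -= stopwords

print(hyp_words & text_words)   # overlap: {'john', 'car'}
print(hyp_words - text_words)   # hyp_extra: {'owns'}
print(text_words - hyp_words)   # txt_extra: {'bought', 'paris'}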
Code example #4
File: spelldiffer.py Project: agh-glk/spelldiffer
class StringSpellchecksFinder(object):
    """
    Compares two strings, finding words that have been corrected between them
    """
    def __init__(self, similarity=0.7):
        self.tokenizer = RegexpTokenizer('[\w-]+')
        self.similarity = similarity

    def find(self, text_before, text_after):
        """
        Finds all spellchecks tuple(mistake, correction) in the given text
        """
        spellchecks = []
        text_before_tokens = map(lambda x: x.lower(), self.tokenizer.tokenize(text_before))
        text_after_tokens = map(lambda x: x.lower(), self.tokenizer.tokenize(text_after))
        diff_matching = SequenceMatcher(None, text_before_tokens, text_after_tokens)
        for difference in filter(lambda x: x[0] == 'replace', diff_matching.get_opcodes()):
            sequence_before = text_before_tokens[difference[1]:difference[2]]
            sequence_after = text_after_tokens[difference[3]:difference[4]]
            spellchecks += self.find_best_match(sequence_before, sequence_after)
        return spellchecks

    def find_best_match(self, sequence_before, sequence_after):
        """
        Finds the best matching of elements pairs that are most probable pairs
        """
        pairs = []
        possibilities = map(lambda element1: map(lambda element2: (element1, element2, SequenceMatcher(None, element1, element2).ratio()) , sequence_after) , sequence_before)
        for possibility in possibilities:
            possibility = filter(lambda p: p[2] >= self.similarity, possibility)
            if possibility:
                possibility.sort(key=lambda p: p[2], reverse=True)
                pairs.append((possibility[0][0], possibility[0][1]))
        return pairs
Code example #5
File: LoadData.py Project: suket22/CS246
    def parse_questions(self):
        stemmer = PorterStemmer()
        tokenizer = RegexpTokenizer(r'\w+')
        for questions_key in self.rawSamples:
            # Stem the Question Text
            question_text = self.rawSamples[questions_key][0]
            words_array = tokenizer.tokenize(question_text)
            question_text = ""
            for word in words_array:
                if word.isnumeric():
                    continue
                if word not in text.ENGLISH_STOP_WORDS:
                    word = stemmer.stem(word)
                question_text += (word + " ")
            self.rawSamples[questions_key][0] = question_text

            # Stem the topic names
            topics_text = self.rawSamples[questions_key][2]
            words_array = tokenizer.tokenize(topics_text)
            topics_text = ""
            for word in words_array:
                if word.isnumeric():
                    continue
                if word not in text.ENGLISH_STOP_WORDS:
                    word = stemmer.stem(word)
                topics_text += (word + " ")
            self.rawSamples[questions_key][2] = topics_text
Code example #6
File: rte_classify.py Project: altaha/ArgoJsonRDBMS
    def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        @param rtepair: a L{RTEPair} from which features should be extracted
        @param stop: if C{True}, stopwords are thrown away.
        @type stop: C{bool}
        """
        self.stop = stop
        self.stopwords = set(
            ["a", "the", "it", "they", "of", "in", "to", "have", "is", "are", "were", "and", "very", ".", ","]
        )

        self.negwords = set(["no", "not", "never", "failed", "rejected", "denied"])
        # Try to tokenize so that abbreviations like "U.S." and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer

        tokenizer = RegexpTokenizer("([A-Z]\.)+|\w+|\$[\d\.]+")

        # Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set([lemmatize(token) for token in self.text_tokens])
            self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
Code example #7
def getData():
    tokenizer = RegexpTokenizer(r'\w+')
    f = open("msr_paraphrase_train.txt", "r")
    f.readline()
    trainInput = []
    trainClass = [0] * 8160
    i = 0
    while i < 8160:
        tokens = f.readline().strip().split('\t')
        trainClass[i] = trainClass[i+1] = int(tokens[0])
        i += 2
        S = tokenizer.tokenize(tokens[3].lower())
        Smatrix1 = sentenceToMatrix(S)
        S = tokenizer.tokenize(tokens[4].lower())
        Smatrix2 = sentenceToMatrix(S)
        trainInput.append([np.transpose(Smatrix1+Smatrix2)])
        trainInput.append([np.transpose(Smatrix2+Smatrix1)])

    f.close()

    f = open("msr_paraphrase_test.txt", "r")
    f.readline()
    testInput = []
    testClass = [0] * 1725
    for i in range(0,1725):
        tokens = f.readline().strip().split('\t')
        testClass[i] = int(tokens[0])
        S = tokenizer.tokenize(tokens[3].lower())
        Smatrix = sentenceToMatrix(S)
        S = tokenizer.tokenize(tokens[4].lower())
        Smatrix.extend(sentenceToMatrix(S))
        testInput.append([np.transpose(Smatrix)])

    f.close()
    return trainInput, trainClass, testInput, testClass
Code example #8
 def __init__(self, rtepair, stop=True, lemmatize=False):
     """
     @param rtepair: a L{RTEPair} from which features should be extracted
     @param stop: if C{True}, stopwords are thrown away.
     @type stop: C{bool}
     """
     self.stop = stop
     self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to',
                           'have', 'is', 'are', 'were', 'and', 'very', '.',','])
     
     self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
     # Try to tokenize so that abbreviations like "U.S." and monetary amounts
     # like "$23.00" are kept as tokens.
     from nltk.tokenize import RegexpTokenizer
     tokenizer = RegexpTokenizer('([A-Z]\.)+|\w+|\$[\d\.]+')
     
     #Get the set of word types for text and hypothesis
     self.text_tokens = tokenizer.tokenize(rtepair.text)
     self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
     self.text_words = set(self.text_tokens)
     self.hyp_words = set(self.hyp_tokens)
     
     if lemmatize:
         self.text_words = set([lemmatize(token) for token in self.text_tokens])
         self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])
     
     if self.stop:
         self.text_words = self.text_words - self.stopwords
         self.hyp_words = self.hyp_words - self.stopwords
         
     self._overlap = self.hyp_words & self.text_words
     self._hyp_extra = self.hyp_words - self.text_words
     self._txt_extra = self.text_words - self.hyp_words
Code example #9
File: models.py Project: onebit1984/epidetect
 def get_outbreak_countries(disease=all):
     tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+')
     
     countries = []
     
     if disease == all:
         for location in Location.objects.all():
             country = tokenizer.tokenize(location.name)
             country = country[len(country)-1]
             
             if country not in countries:
                 countries.append(str(country))
     else:
         for tweet in Tweet.objects.filter(disease_type__contains=disease):
             if tweet.location:
                 country = tokenizer.tokenize(tweet.location.name)
                 country = country[len(country)-1]
                 country_disease_count = [str(country), \
                 len(Tweet.objects.filter(disease_type__contains=disease, \
                 location_string__contains=country)), disease]
                 
                 if country_disease_count not in countries:
                     countries.append(country_disease_count)
                 
     return countries
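The pattern r'\w+|[^\w\s]+' splits a location name into words and punctuation runs, and the code then takes the last token as the country. A quick illustration of that assumption with a made-up location string:

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+')
tokens = tokenizer.tokenize("Toronto, Ontario, Canada")
print(tokens)      # ['Toronto', ',', 'Ontario', ',', 'Canada']
print(tokens[-1])  # 'Canada'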
Code example #10
File: ngramc.py Project: tonyqtian/sentence_checker
def demo():
#    from nltk.corpus import brown
#    from nltk.probability import LidstoneProbDist, WittenBellProbDist
#    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
#    estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer("[\w']+")
    lm = NgramcModel(5)
    print lm
    
    sent = "Like a bridge over troubled water, I will lay it down."
    print sent
    words = tokenizer.tokenize(sent)
    print "Entropy: ", lm.entropy(words)
    
    sent = "over twenty year and he"
    print sent
    words = tokenizer.tokenize(sent)
    print "Entropy: ", lm.entropy(words)
    
    sent = "over twenty years and he"
    print sent
    words = tokenizer.tokenize(sent)
    print "Entropy: ", lm.entropy(words)    

    print lm.getBetter(["men" ,"are" ,"imporant" ,"for" ,"the"], ["men" ,"are" ,"important" ,"for" ,"the"])
Code example #11
File: script.py Project: g31pranjal/git-analysis
def stopWordRemoval() :


	f = open('repos', 'r')
	strn = f.read()
	lst = strn.split('\n')

	i = 0
	while i < (len(lst) - 1) :
	
		name = lst[i].split("/")

		dummyFile = 'filteredData/' + name[1] + '/dummy.txt';
		dr = os.path.dirname(dummyFile)

		if not os.path.exists(dr) :
			os.makedirs(dr)

		ft = open('data/'+name[1]+'/title.txt')
		st = ft.read().lower()

		fd = open('data/'+name[1]+'/description.txt')
		sd = fd.read().lower()

		fc = open('data/'+name[1]+'/content.txt')
		sc = fc.read().lower()
		

		tokenizer = RegexpTokenizer(r'\w+')

		wordArrTitle = tokenizer.tokenize(st)
		wordArrDesc = tokenizer.tokenize(sd)
		wordArrData = tokenizer.tokenize(sc)

		filteredWordsTitle = [w for w in wordArrTitle if not w in stopwords.words('english')]
		filteredWordsDesc = [w for w in wordArrDesc if not w in stopwords.words('english')]
		filteredWordsData = [w for w in wordArrData if not w in stopwords.words('english')]

		wordnet_lem= WordNetLemmatizer()


		ftf = open('filteredData/'+name[1]+'/title.lst','w')
		for w in filteredWordsTitle:
			#print w
			ftf.write(wordnet_lem.lemmatize(w)+'\n')

		fdf = open('filteredData/'+name[1]+'/description.lst','w')
		for w in filteredWordsDesc:
			#print w
			fdf.write(wordnet_lem.lemmatize(w)+'\n')

		fcf = open('filteredData/'+name[1]+'/content.lst','w')
		for w in filteredWordsData:
			print w+'\n'
			fcf.write(wordnet_lem.lemmatize(w)+'\n')
		
		i=i+2
Code example #12
def average_sentence_length(text):
    tokenizer = RegexpTokenizer(r' ([A-Z][^\.!?]*[\.!?])')
    sentences = tokenizer.tokenize(text)
    s = np.zeros(len(sentences))
    for inds, sentence in enumerate(sentences):
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(sentence)
        s[inds] = len(tokens)
    return s, np.mean(s), np.std(s)
Code example #13
    def _generate_answer_question_pair(self, question, article, X_train_words, Y_train_words, max_seqlen, max_queslen):

        tokenizer = RegexpTokenizer(r'\w+')
        answer =  re.split(r'\t+', question)[1]
        question_txt = tokenizer.tokenize(question)[1:-2]
        ref = int(re.split(r'\t+', question)[-1]) - 1
        seq = tokenizer.tokenize(article[ref])[1:] + question_txt

        if len(seq) > max_seqlen:
            max_seqlen = len(seq)
        X_train_words.append(seq)
        Y_train_words.append(answer)
        return max_seqlen, max_queslen
Code example #14
File: extract_features.py Project: oowowaee/wwlns
def calculate_freqs(data, toExclude):
    # lemmatizer = WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words("english")
    sents = nltk.tokenize.sent_tokenize(data)
    tokenizer = RegexpTokenizer(r"\w+\'?\w+")

    # tagged_sentences = [w for s in sents for w in nltk.pos_tag(word_tokenize(s))]
    # words = [lemmatizer.lemmatize(w[0].lower(), get_wordnet_pos(w[1])) for w in tagged_sentences] # if w.lower() not in stopwords]
    if toExclude:
        words = [w for s in sents for w in tokenizer.tokenize(s) if w.lower() not in stopwords]
    else:
        words = [w for s in sents for w in tokenizer.tokenize(s)]
    return words
Code example #15
File: cloud_stats.py Project: cderici/hazircevap
def parse_document(filename,query):
    myfile = codecs.open(filename,"r","utf-8")
    raw = myfile.read()
    sentences = sent_tokenize(raw)
    tokenizer = RegexpTokenizer(r'\w+') #tokenizer.tokenize(sentences[0])
    stop = stopwords.words('english')

    sents = [[token.lower() for token in tokenizer.tokenize(sentence) if
               not(token in stop or token.isdigit())] for sentence in sentences]

    query_t = [token for token in tokenizer.tokenize(query) if not(token in stop or token.isdigit())]
    cloud = " ".join(list(itertools.chain(*sents)))
    return cloud,query_t
Code example #16
File: read_mail.py Project: Dokhyam/Emails
def clean_data(data):
	punctuations = list(string.punctuation)
	data = data.replace("\n"," ").replace(":", " ").replace(",","").replace(".","").replace("'s","").replace("?","")
	stemmer = PorterStemmer()
	stemmer2 = SnowballStemmer('english')
	tokenizer = RegexpTokenizer(r'\w+')
	tokenizer.tokenize(data)
	ndata1 = list(mysplit(data))
	ndata1 = [[stemmer.stem(xi) for xi in y.split(" ")] for y in ndata1] 
	ndata1 = [[stemmer2.stem(xi) for xi in y] for y in ndata1]
	ndata = [x for x in ndata1 if not x == ":"]
	ndata = [filter(None, x) for x in ndata]
	ndata = [x for x in ndata if x != []]
	return ndata
Code example #17
    def map(self): 
        mc=MongoClient('ec2-52-0-148-244.compute-1.amazonaws.com',27017)
        dbmc=mc.genid
        idoc=dbmc.gentable.find_one_and_update(filter={},update={ "$inc": { "score": 1 } },upsert=True);
        k=Key(self.bucket)
        y=stopwords.words('english')
        i=1
        strx=str(int(idoc['score']))
        strz=None
        filestring=""
        for line in sys.stdin:
 
            
            line = unicode(line, "utf-8","ignore")
            pattern = re.compile(r'\b(' + r'|'.join(y) + r')\b\s*')
            line = pattern.sub('', line)
            

            tokenizer = RegexpTokenizer(r'\w+')
            words=tokenizer.tokenize(line)
            strz=strx+'a'+str(i)
            k.key=strz
            filestring=line+'\n'
            k.set_contents_from_string(filestring)
            for word in words:
                word=word.encode(encoding='UTF-8',errors='ignore')
                
                print '%s\t%s' % (word.strip(), strz)
            i+=1
Code example #18
def generate_stemmed_tokens(page_content):
    lowered = page_content.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(lowered)
    stems = create_stems(tokens)

    return stems
Code example #19
File: textual_features.py Project: gsi-upm/gsitk
	def createLDAModel(texts, n_topics, n_passes):
		"""Generates a LDA model from an array of texts
		"""
		tokenizer = RegexpTokenizer(r'\w+')
		#Create EN stop words list
		en_stop = get_stop_words('en')
		#Create p_stemmer of class PorterStemmer
		p_stemmer = PorterStemmer()

		texts_ = []

		# loop through document list
		for i in texts:
		    
		    # clean and tokenize document string
		    raw = i.lower()
		    tokens = tokenizer.tokenize(raw)
		    
		    # remove stop words from tokens
		    stopped_tokens = [i for i in tokens if not i in en_stop]
		    # stem tokens
		    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
		    # add tokens to list
		    texts_.append(stemmed_tokens)

		# turn our tokenized documents into a id <-> term dictionary
		dictionary = corpora.Dictionary(texts_)

		# convert tokenized documents into a document-term matrix
		corpus = [dictionary.doc2bow(text) for text in texts_]

		# generate LDA model
		ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=n_topics, id2word = dictionary, passes=n_passes)

		return(ldamodel)
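If the helper above is reachable as a plain function (it takes no self argument), a hypothetical call could look like the sketch below; the documents, topic count, and pass count are illustrative, and gensim, stop_words, and nltk must be installed:

docs = ["Cats chase mice and dogs chase cats.",
        "Stock markets and bond markets fell sharply today."]
lda = createLDAModel(docs, n_topics=2, n_passes=10)
print(lda.print_topics(num_topics=2, num_words=3))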
Code example #20
File: movier.py Project: Hongtian22/Movier
    def tokenize(self, doc):
        '''
        use NLTK RegexpTokenizer
        '''

        tokenizer = RegexpTokenizer("\w{3,}")
        return [self.stemmer.stem(x) for x in tokenizer.tokenize(doc)]
Code example #21
def lda(data):
	data = get_only_text(data)
	only_tweet = data
	length = len(only_tweet)
	length = min(20,length)
	for i in xrange(0,length):
		print i
		print only_tweet[i]
	return
	
	tokenizer = RegexpTokenizer(r'\w+')
	en_stop = get_stop_words('en')
	p_stemmer = PorterStemmer()

	length = len(only_tweet)
	length = min(20,length)
	total_texts = []
	for i in xrange(0,length):
		print only_tweet[i]
		print 
		to_lower = only_tweet[i].lower()
		tokens = tokenizer.tokenize(to_lower)
		stopped_tokens = [k for k in tokens if not k in en_stop]
		texts = [p_stemmer.stem(k) for k in stopped_tokens]
		total_texts.append(texts)

	dictionary = corpora.Dictionary(total_texts)
	corpus = [dictionary.doc2bow(text) for text in total_texts]

	ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)
	result =  ldamodel.print_topics(num_topics=2, num_words=1)
	for i in result:
		print i
Code example #22
def Tokenize(TextData):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = list()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # clean and tokenize document string
    raw = TextData.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    tokens = stemmed_tokens

    TOKENIZEDTEXT_FILE = path.join(os.pardir, "Resources/TokenizedTextFiles/Personal-Narration/Unbroken - Motivational Video.txt")
    fp = open(TOKENIZEDTEXT_FILE, "w")
    print(TOKENIZEDTEXT_FILE)
    # pickle.dump(tokens, fp)
    fp.write(str(tokens))
    fp.close()
Code example #23
 def run(self, data):
     results = []
     tokenizer = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)
     for corpus in data:
         corpus.contents = " ".join(tokenizer.tokenize(corpus.contents))
         results.append(corpus)
     return results
Code example #24
File: NaNoGenMo.py Project: iangonzalez/NaNoGenMo
    def trainMarkovChain(self, n = 1):

        self.ngram_degree = n
      
        self.markov_model = defaultdict(lambda : defaultdict(int))

        sentences = self.corpus_sentences
        if sentences is None:
            sentences = self.sentenceTokenizeCorpus()

        print("Training markov model on corpus.")

        word_tokenizer = RegexpTokenizer(r"\w+")

        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            last_word_list = ["#"] * n

            for word in words:
                last_token = " ".join(last_word_list)
                
                self.markov_model[last_token][word] += 1
                
                last_word_list.append(word)
                last_word_list = last_word_list[1:]

            last_token = " ".join(last_word_list)
            self.markov_model[last_token]["#"] += 1
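The table built above maps an n-word context to counts of the word that follows it, with "#" marking sentence boundaries. A hedged sketch of how one might sample from such a table to generate a sentence; the generate function and its defaults are illustrative, not part of the project:

import random

def generate(markov_model, n=1, max_words=20):
    context = ["#"] * n                          # boundary padding, as in training
    out = []
    for _ in range(max_words):
        choices = markov_model.get(" ".join(context), {})
        if not choices:
            break
        words, counts = zip(*choices.items())
        word = random.choices(words, weights=counts, k=1)[0]
        if word == "#":                          # hit a sentence boundary
            break
        out.append(word)
        context = (context + [word])[1:]
    return " ".join(out)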
Code example #25
def text_process(text):
    '''
    Takes in a string of text, then performs the following
    1. Tokenizes and removes punctuation
    2. Removes stopwords
    3. Stems
    4. Returns a list of the cleaned text
    '''
    if(pd.isnull(text)):
        return []
    
    # Tokenize 
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed = tokenizer.tokenize(text)
    
    # Removing any stopwords
    text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
    
    # Stemming
    porterStemmer = PorterStemmer()
    
    text_processed = [porterStemmer.stem(word) for word in text_processed]
    
    try:
        text_processed.remove('b')
        
    except:
        pass
    
    return " ".join(text_processed)
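For orientation, a hypothetical call (assumes the nltk stopwords corpus has been downloaded; the input sentence is made up):

# text_process("The dogs are running in the park!")
# returns 'dog run park'  (stopwords dropped, remaining words Porter-stemmed)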
Code example #26
File: process.py Project: atantri/data_mining
	def parse_raw_data(self, new_art):
		self.startClass=default_timer()
		tokenizer = RegexpTokenizer(r'\w+')
		tokens = tokenizer.tokenize(new_art.body)
		stemmer = LancasterStemmer()
		article_dic = new_art.words
		global_dic = self.raw_dictionary

		for word in tokens:
			word = word.lower()
			if(False == self.is_stop_word(word) and word.isnumeric()==False):
				s_word = stemmer.stem(word)

			#	s_word = word
			## it is not a stop word, check if the word
			## is already part of the article dictionary.
			## if yes, increment the count else add it.
			## If you are adding check if it is part of
			## the big corpus, if yes increment the count
			## of number of articles with that word.
				self.globalWordCount+=1
				new_art.doc_len = new_art.doc_len + 1
				if(s_word in article_dic):
					article_dic[s_word].wrd_count+=1
					global_dic[s_word].wrd_count+=1
				else:
					article_dic[s_word] = local_word_attributes(1)

					if (s_word in global_dic):
						global_dic[s_word].art_count+=1
						global_dic[s_word].wrd_count+=1
					else:
						global_dic[s_word] = global_word_attributes(1,1, 1, 0)
Code example #27
class Categorizer:
    def __init__(self,pathToModel,features):
        #initialize categorizer with model.
        self.tokenizer =  RegexpTokenizer('[A-Za-z]\w+')
        fp = open(pathToModel,"r")
        fpf = open(features,"r")
        model = pickle.load(fp)
        self.features = pickle.load(fpf)
        fp.close()
        fpf.close()
        self.classifierNB = model
        
    def classify(self,text):
        featureSet = self.naiveFeatures(text)
        #print featureSet
        labels = self.classifierNB.classify(featureSet)
        labelsProbDist = self.classifierNB.prob_classify(featureSet)
        return labels

        
    def naiveFeatures(self,vid,train=False):
        vidTokens =  self.tokenizer.tokenize(vid.lower().strip())
        vid = set(vidTokens)
        #print vid
        features = {}
        if train:
            for word in self.features:
                features[word] = (word in vid)
        else:
            for word in vid:
                features[word] = (word in self.features)
        return features
Code example #28
File: build_model_summaries.py Project: kedz/cuttsum
def write_summary(texts, ofile):
    word_tokenizer = RegexpTokenizer(r"\w+")
    with codecs.open(ofile, u"w", u"utf-8") as f:
        for text in texts:
            f.write(u" ".join([w.lower() for w in word_tokenizer.tokenize(text)]))
            f.write(u"\n")
            f.flush()
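As a quick illustration of what write_summary emits, a hypothetical call (the texts and file name are made up):

# write_summary([u"Hello, World!", u"Second line."], "summary.txt")
# summary.txt then contains:
#   hello world
#   second line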
Code example #29
File: views.py Project: Omrigan/travelrec
def textToWordList(txt):
    p_stemmer = RussianStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    stop_w = [p_stemmer.stem(i) for i in get_stop_words('ru')]
    r = re.compile('^[а-я]+$')
    badword =[
        'дом',
        'город',
        "дорог",
        "час",
        "ноч",
        "слов",
        "утр",
        "стран",
        "пут",
        "путешеств",
        "мест",
        'нов',
        "друз",
        "добр"
    ]
    txt = txt.lower().replace("<br>", "\n")
    tokens = [p_stemmer.stem(i) for i in tokenizer.tokenize(txt)]
    tokens = [i for i in tokens if not i in stop_w and r.match(i) and not i in badword]
    return tokens
Code example #30
def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if not w in stopwords.words('english')]
    #filtered_words = filter(lambda token: token not in stopwords.words('english'))
    return " ".join(filtered_words)
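A hypothetical call (requires the nltk stopwords corpus; the sentence is made up):

# preprocess("The quick brown fox jumps over the lazy dog!")
# returns 'quick brown fox jumps lazy dog'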
Code example #31
#!/usr/bin/env python
import sys
import pickle
from nltk.tokenize import RegexpTokenizer

# xzcat dev-0/in.tsv.xz | python3 ./predict.py > dev-0/out.tsv

weights, word_to_index_mapping = pickle.load(open('model.pkl', 'rb'))
tokenizer = RegexpTokenizer(r'\b[^\d\W]+\b')

for line in sys.stdin:
    document = line.rstrip()
    terms = tokenizer.tokenize(document)

    y_p = weights[0]
    for term in terms:
        if term in word_to_index_mapping:
            y_p += weights[word_to_index_mapping[term]]

    print(y_p)
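The loop above is a plain bag-of-words linear model: the prediction is the bias weight plus the weight of every token the model knows. A toy version of the same arithmetic with made-up weights:

weights = [0.5, 1.2, -0.7]                   # index 0 is the bias term
word_to_index_mapping = {"good": 1, "bad": 2}

y_p = weights[0]
for term in ["a", "good", "day"]:
    if term in word_to_index_mapping:
        y_p += weights[word_to_index_mapping[term]]
print(y_p)                                   # 0.5 + 1.2 = 1.7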
Code example #32
File: qa8.py Project: priya-padmanaban/NLP-QA-System
def get_answer(question, story):
    """
    :param question: dict
    :param story: dict
    :return: str


    question is a dictionary with keys:
        dep -- A list of dependency graphs for the question sentence.
        par -- A list of constituency parses for the question sentence.
        text -- The raw text of story.
        sid --  The story id.
        difficulty -- easy, medium, or hard
        type -- whether you need to use the 'sch' or 'story' versions
                of the story.
        qid  --  The id of the question.


    story is a dictionary with keys:
        story_dep -- list of dependency graphs for each sentence of
                    the story version.
        sch_dep -- list of dependency graphs for each sentence of
                    the sch version.
        sch_par -- list of constituency parses for each sentence of
                    the sch version.
        story_par -- list of constituency parses for each sentence of
                    the story version.
        sch --  the raw text for the sch version.
        text -- the raw text for the story version.
        sid --  the story id


    """
    ###     Your Code Goes Here         ###
    # Our tools

    stemmer = SnowballStemmer("english")
    chunker = nltk.RegexpParser(GRAMMAR)
    lmtzr = WordNetLemmatizer()

    driver = QABase()

    # question["qid"] returns the form: "fables-04-7"
    q = driver.get_question(question["qid"])
    current_story = driver.get_story(q["sid"])

    #############################################
    # if question["qid"] == 'blogs-03-1':
    #     print(question["text"])
    #     print(sent_tokenized_text[0])
    #     print("++++++++++++++++++++++++++++++++++++++++++++++")
    ############################################

    stopwords = set(nltk.corpus.stopwords.words("english"))


    if (question["difficulty"] == 'Easy'):



        if question["type"] != 'Story':
            sentences = get_sentences(current_story["sch"])
            text = story["sch"]
            text = nltk.sent_tokenize(text)

        else:
            sentences = get_sentences(current_story["text"])
            text = story["text"]
            text = nltk.sent_tokenize(text)

        Q = nltk.word_tokenize(question["text"].lower())
        # print(Q)

        all_stemmed_sentences = []
        for sent in sentences:
            temp_sent = []
            for w, pos in sent:
                temp_sent.append((stemmer.stem(w), pos))
            all_stemmed_sentences.append(temp_sent)
        stop_words = set(nltk.corpus.stopwords.words("english"))
        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        stemmed_qbow = []
        for w in qbow:
            stemmed_qbow.append(stemmer.stem(w))
        stemmed_qbow = set(stemmed_qbow)
        best_idx = best_overlap_index(stemmed_qbow, all_stemmed_sentences, stop_words, question)
        # print(question["qid"], best_idx)

        # tokenize questions, also removing punctuations to extract keywords
        tokenizer = RegexpTokenizer(r'\w+')
        tokenized_question_text = tokenizer.tokenize(question["text"])
        tagged_tokenized_question_text = nltk.pos_tag(tokenized_question_text)

        # remove stopwords
        tagged_keywords_list = []

        for word, tag in tagged_tokenized_question_text:
            if word not in stopwords:
                tagged_keywords_list.append((word, tag))

        # lemmatize keywords
        lemmatized_keywords_list = []
        for keyword, tag in tagged_keywords_list:
            lemmatized_keywords_list.append(stemmer.stem(keyword))

        #####################################################
        # if question["qid"] == 'fables-04-6':
        #     print("text:", text)
        #     print("best index:", best_idx)
        #     print("qid:", question["qid"])
        #     print(text[best_idx])
        #     print("==============================")
        #     print(get_sentences("".join(text)))
        #####################################################


        best_sent = get_sentences(text[best_idx])

        # Find the sentences that have all of our keywords in them
        # Last time, 2nd arg is sentences = get_sentences(text) which returns tuple of each word
        target_sentences = find_sentences(lemmatized_keywords_list, best_sent)
        # Extract the candidate locations from these sentences
        candidates_forest = find_candidates(target_sentences, chunker, question["text"])

        if len(candidates_forest) == 0:
            answer = doBaseline(question, story)
        else:

            possible_answers_list = []

            # locations is a list of trees
            for candidate in candidates_forest:
                # candidate.draw()
                possible_answers_list.append(" ".join([token[0] for token in candidate.leaves()]))
            answer = " ".join(possible_answers_list)

            ###########################################
            # currently, possible_answer contains the actual needed answer,
            # plus some garbage words around it from chunking,
            # we might be able to filter this out SOMEHOW
            # possible_answer is a list of strings
            ###########################################


    elif question["difficulty"] == 'Medium':

        if question["type"] != 'Story':
            sentences = get_sentences(current_story["sch"])
        else:
            sentences = get_sentences(current_story["text"])

        Q = nltk.word_tokenize(question["text"].lower())
        # print(Q)

        all_stemmed_sentences = []
        for sent in sentences:
            temp_sent = []
            for w, pos in sent:
                temp_sent.append((stemmer.stem(w), pos))
            all_stemmed_sentences.append(temp_sent)
        stop_words = set(nltk.corpus.stopwords.words("english"))
        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        stemmed_qbow = []
        for w in qbow:
            stemmed_qbow.append(stemmer.stem(w))
        stemmed_qbow = set(stemmed_qbow)
        best_idx = best_overlap_index(stemmed_qbow, all_stemmed_sentences, stop_words, question)
        # print(question["qid"], best_idx)

        if question["type"] != 'Story':
            tree = current_story["sch_par"][best_idx]
        else:
            tree = current_story["story_par"][best_idx]

        #############################################
        # if question["qid"] == 'blogs-03-13':
        #     print(Q)
        #     print(tree)
        #     print("++++++++++++++++++++++++++++++++++++++++++++++")
        ############################################
        # print(tree)
        # Create our pattern

        #########################################
        # MAKE PATTERN FIT FOR TYPE OF QUESTION #
        #########################################
        # print(Q[0])
        if Q[0] == 'where' or Q[0] == 'when':
            pattern = nltk.ParentedTree.fromstring("(VP (*) (PP))")
        elif Q[0] == 'who':
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif Q[0] == 'what':
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif Q[0] == 'why':
            pattern = nltk.ParentedTree.fromstring("(SBAR)")
        elif Q[0] == 'how':
            pattern = nltk.ParentedTree.fromstring("(RB)")

        # don't know how to deal with 'did' questions
        elif Q[0] == 'did':
            pattern = nltk.ParentedTree.fromstring("(S)")

        subtree1 = pattern_matcher(pattern, tree)

        ############################################
        # if question["qid"] == 'blogs-03-13':
        #     print("subtree1")
        #     print(subtree1)
        ############################################
        if subtree1 == None:
            #######################################
            answer = doBaseline(question, story)
            # answer = "doBaseline"
            #######################################
        else:
            # create a new pattern to match a smaller subset of subtrees
            if Q[0] == 'where' or Q[0] == 'when':
                pattern = nltk.ParentedTree.fromstring("(VP)")
            elif Q[0] == 'who':
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif Q[0] == 'what':
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif Q[0] == 'why':
                pattern = nltk.ParentedTree.fromstring("(SBAR (IN) (S))")
            elif Q[0] == 'how':
                pattern = nltk.ParentedTree.fromstring("(RB)")

            # don't know how to deal with 'did' questions
            elif Q[0] == 'did':
                pattern = nltk.ParentedTree.fromstring("(S)")


            # Find and make the answer
            # print(subtree)
            subtree2 = pattern_matcher(pattern, subtree1)
            if subtree2 == None:
                #######################################
                answer = doBaseline(question, story)
                # answer = "doBaseline"
                #######################################
            else:
                answer = " ".join(subtree2.leaves())

            ############################################
            # if question["qid"] == 'mc500.train.18.18':
            #     print("subtree2")
            #     print(subtree2)
            ############################################
            # cheat for dealing with 'did' questions
            if Q[0] == 'did':
                answer = "yes"

    elif question["difficulty"] == 'Hard':

        answer = "h"


    elif question["difficulty"] == 'Discourse':

        answer = "h"


    else:
        #########################################
        answer = doBaseline(question, story)
        # answer = "doBaseline"
        #########################################

    ###     End of Your Code         ###
    return answer
Code example #33
File: sentiwn.py Project: ninap11/projekat-petnica
def tfidf(article, articles):
    pass  # empty stub in this snippet; the module-level code below does the work


txt_file = open(r"C:\Users\Bratislav\Desktop\petnica projekat\data\train set.txt", "r+", encoding= "utf-8-sig")
article = txt_file.readlines()[7]
article = article[article.find(" ")+11:]

tokenizer = RegexpTokenizer(r'\w+')
corpus = nltk.corpus.stopwords.words('english')
stemmer = PorterStemmer()

sentences = nltk.sent_tokenize(article)
# print(words)
lemmatizer = WordNetLemmatizer()
# tagger = WordNetTagger()

syns = []
no_lemmas = []
lemmas = []
a = 0
b = 0
c = 0

pos = 0
neg = 0
obj = 0

for sentence in sentences:
    # print(sentence)
    pos1, neg1, obj1 = 0, 0, 0
    syns = []
    words = tokenizer.tokenize(sentence)
    no_stop = no_stopwords_func(words)
    # print(no_stop)
    
    # print(ws)
    tokens = nltk.pos_tag(no_stop)
    # print(token)
    for token in tokens:
        lemma = lemmatizer.lemmatize(token[0])
        stem = stemmer.stem(lemma)
        stemmed        
        
    

#     if obj1 == max(pos1, neg1, obj1):
#         print("a")
#         if obj1 == pos1:
#             pos += 1
#             print("p")
#         elif obj1 == neg1:
#             neg += 1
#             print("n")
#         else:
#             obj += 1
#     elif pos1 == max(pos1, neg1, obj1):
#         print("b")    
#         if pos1 == neg1:
#             obj += 1
#             print("o")
#         else:
#             pos +=1
#     elif neg1 == max(pos1, neg1, obj1):
#         print("c")
#         if neg1 == pos1:
#             obj += 1
#             print("o")
#         else:
#             neg += 1
#             print("n")

# print(pos, neg, obj)



# for synset in syns:
#     pos, neg, obj = 0, 0, 0
#     pos += synset.pos_score()
#     neg += synset.neg_score()
#     obj += synset.obj_score()
#     print(obj, pos, neg)
# print(pos, neg, obj)
# # print(syns)
# print(no_lemmas)
# print(a, b, c)
# # print(swn.senti_synset("Bad.n.01"))
Code example #34
def essay_grader(f_name, data, topic):
    wrong = 0
    sent_count = 0
    sentences = sent_tokenize(data)
    result = ""

    for sentence in sentences:
        sent_count += 1

    from nltk.tokenize import word_tokenize
    from nltk.tokenize import RegexpTokenizer

    spelling_error = 0

    # RegexpTokenizer keeps runs of letters, digits and apostrophes and drops other punctuation
    tokenizer = RegexpTokenizer('[A-Za-z0-9\']+')

    tk1 = tokenizer.tokenize(data)
    result = ""
    for token in tk1:
        result += "[" + token + "] "

########################### b. Spelling mistakes ##################################

    from nltk import pos_tag

    # Making use of two dictionaries using pyenchant to compare spellings
    d_US = enchant.Dict("en_US")
    d_UK = enchant.Dict("en_UK")
    tagged_tokens = pos_tag(tk1)
    result = ""
    spelling_error = 0
    serror = []

    # This is done to make sure that the proper noun is not considered as a spelling error
    crosscheck = ['NNP', 'NNPS']

    # Checking the spelling error for each word in the essay
    for token in tagged_tokens:
        result += '[' + token[0] + '/' + token[1] + ']'
        flag = 0
        for ind, tag_val in enumerate(crosscheck):
            if (token[1] == crosscheck[ind]):
                flag = 1
        if flag != 1:
            val_US = d_US.check(token[0])
            val_UK = d_UK.check(token[0])
            if (val_US == False and val_UK == False):
                serror.append(token[0])
                spelling_error += 1

########################## c.(i) Subject verb agreement ########################

# Here we check the most common agreement mistake: 'this'/'these' followed by a noun of the wrong number
    gramm_mist = 0
    for k in sentences:
        tokens = word_tokenize(k)
        for i, j in enumerate(
                tokens):  #for a given set of tokens in a given sentence
            if j == 'this':
                list_temp = nltk.tag.pos_tag([tokens[i + 1]])
                for tag in list_temp:
                    if (tag[1] == 'NNS'):
                        gramm_mist += 1

            if j == 'these':
                list_temp1 = nltk.tag.pos_tag([tokens[i + 1]])
                for tag in list_temp1:
                    if (tag[1] == 'NN'):
                        gramm_mist += 1

    tokens = word_tokenize(data)
    result = ""
    for token in tokens:
        result += "[" + token + "] "

    from nltk import pos_tag
    tagged_tokens = pos_tag(tokens)
    result = ""

    # In this case we check for subject verb agreement using different pairs of tags
    # We also detect missing commas by flagging tag pairs that cannot appear next to each other without one
    # We also found that two determiners cannot be together
    crosscheck = [
        'NNP VBP', 'MD VBN', 'DT DT', 'DT VBP', 'DT VB', 'DT PRP', 'MD VBD',
        'JJS PRP'
    ]

    previousTag = '.'
    previousWord = ''
    pairs_mtake = 0
    for token in tagged_tokens:
        result += '[' + token[0] + '/' + token[1] + ']'

        previousTag_tag = previousTag + ' ' + token[1]
        previousTag = token[1]

        previousWord_word = previousWord + ' ' + token[0]
        previousWord = token[0]

        # The bigram pos pairs are checked with the pairs in the crosschecked list

        flag = 0
        for ind, tag_val in enumerate(crosscheck):
            if (previousTag_tag == crosscheck[ind]):
                flag = 1
                pairs_mtake += 1

    pos_gramm_mistakes = pairs_mtake + gramm_mist
    pos_mist.append(pos_gramm_mistakes)

    ################# c.(ii) - Detecting missing verbs and tense mistakes ###########

    verb_mist = 0

    #individual sentences in the list
    for k in sentences:
        doc = nlp(k)
        str = ""
        # tokenize individual sentences
        for token in doc:
            str = str + token.pos_ + " "

        if str.find("VERB") == -1:
            verb_mist += 1

    # In this case we check tense mistakes and missing verbs by cross-checking adjacent tag pairs
    crosscheck = ['NNP VBP', 'NNS VBZ', 'VBZ NNP', 'VBP NNP']

    previousTag = '.'
    previousWord = ''
    tense_mtake = 0

    # Pairs of tokens are checked each time by making use of crosscheck array in order to find mistakes in pairs
    for token in tagged_tokens:
        result += '[' + token[0] + '/' + token[1] + ']'

        previousTag_tag = previousTag + ' ' + token[1]
        previousTag = token[1]

        previousWord_word = previousWord + ' ' + token[0]
        previousWord = token[0]

        flag = 0
        for ind, tag_val in enumerate(crosscheck):
            if (previousTag_tag == crosscheck[ind]):
                flag = 1
                tense_mtake += 1

    verb_tensemist = verb_mist + tense_mtake
    vb_mist.append(verb_tensemist)

    ########################## c.(iii) Sentence Formation ######################################

    error_frag = 0
    for k in sentences:
        output = nlp1.annotate(k,
                               properties={
                                   'annotators':
                                   'tokenize,ssplit,pos,depparse,parse',
                                   'outputFormat': 'json'
                               })

        sbar_flag = 0
        s_flag = 0
        if (count <= 83):
            for i, p in enumerate([
                    s['parse'] for s in output['sentences']
            ]):  #Returns a parse tree for a particular sentence
                index_s = p.find('(S')
                if (p[index_s + 2] == '\n' or p[index_s + 2] == ' '):
                    s_flag = 1

                index_sbar = p.find('SBAR')
                if (p[index_sbar + 4] == " " or p[index_sbar + 4] == "\n"):
                    sbar_flag = 1

                if "FRAG" in p:
                    if (sbar_flag == 1 and s_flag == 0):
                        #print(p)
                        error_frag += 1


############################ d.(i) Is the essay coherent? #######################################
    tokenizer = RegexpTokenizer('[A-Za-z0-9\']+')

    sentences = sent_tokenize(data)

    prev_sent = ""
    for ind, s in enumerate(sentences):

        if (ind != 0):
            tk1 = tokenizer.tokenize(s)
            tagged_tokens = pos_tag(tk1)
            for token in tagged_tokens:
                if (token[1] == 'PRP' or token[1]
                        == 'PRP$'):  #Looking for pronouns in 3rd person
                    if (token[0].casefold() not in list1
                            and token[0] not in f_list1):
                        f_list1.append(token[0])

            prev_sent = sentences[ind - 1]
            utterances = s  #Current sentence
            context = prev_sent  #Previous sentence for conflict resolution

            clusters = coref.one_shot_coref(utterances, context)

            most = coref.get_most_representative(
            )  #Generates links between context and utterance
            most1 = repr(most)
            for x in f_list1:
                if x not in most1:
                    #print("%s\n" %context)
                    #print("%s\n" %utterances)
                    #print("%s\n\n\n" %x)

                    wrong += 1
                    break
            f_list1.clear()
    c_list.append(wrong)
    coref_mist = wrong
    wrong = 0

    ############################ d.(ii) Does the essay stay on topic       #######################################
    tk1 = tokenizer.tokenize(topic)
    tagged_tokens = pos_tag(tk1)
    new_top = ''

    # Here I check for all the noun occurrences in the essay
    for token in tagged_tokens:
        if (token[1] == 'NNS' or token[1] == 'NN' or token[1] == 'NNP'
                or token[1] == 'NNPS'):
            new_top = new_top + token[0] + " "

    nouns = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('n')}

    dic = {}
    #I use wordnet to find the main words in the topic which are used later to find similar words in essay
    for i, k in enumerate(topic.split()):
        synonyms = []
        for syn in wordnet.synsets(k):
            for l in syn.lemmas():
                synonyms.append(l.name())
        dic.update({i: set(synonyms)})

    word_set = set()

    #Now using the synonyms of the words in the topics I find the match in the essay
    for k in data.split(' '):  #Each of the words in essay
        for val in dic:
            if (k in dic[val] and k in nouns):
                word_set.add(k)

    for i, k in enumerate(new_top.split()):
        synonyms = []
        for syn in wordnet.synsets(k):
            for l in syn.lemmas():
                synonyms.append(l.name())
        dic.update({i: set(synonyms)})

    for k in data.split(' '):  #Each of the words in essay
        for j in new_top.split(' '):
            if (k == j):
                word_set.add(k)

    new_set = set()
    for k in word_set:
        if (k != ''):
            new_set.add(lemmatizer.lemmatize(k))
    #print(len(new_set))

    #The length gives the number of words that are related to the topic
    ess_coher = len(new_set)
    coher.append(len(new_set))

    ################################################################################

    scores(f_name, sent_count, spelling_error, serror, pos_gramm_mistakes,
           verb_tensemist, error_frag, ess_coher, coref_mist)
Code example #35
File: jvalin_P1.py Project: jvalin17/Python-Programs
visited = []
v = []
fx = []
jk = []
#import ntpath
import os
import glob

os.chdir('stateoftheunionaddresses')
for fil in glob.glob("*.txt"):
    v.append(fil)

corpus_root = ('stateoftheunionaddresses')
for stateoftheunionaddresses in os.listdir(corpus_root):
    file = open(os.path.join(corpus_root, stateoftheunionaddresses), "r")
    doc = file.read()
    b = sorted((stopwords.words('english')))
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    tokens = tokenizer.tokenize(doc)

    tokens = [w for w in tokens if not w.lower() in b]

    #print(v[f])

    for p in range(len(tokens)):
        stemmer = PorterStemmer()
        tokens[p] = stemmer.stem(tokens[p])

    for j in range(len(tokens)):
        fx.append(tokens[j])

    jk.append(len(tokens))

    count = Counter(tokens)
Code example #36
def loadData(filename):
    global uniqueWords, wordcodes, wordcounts
    override = True
    if override:
        #... for debugging purposes, reloading input file and tokenizing is quite slow
        #...  >> simply reload the completed objects. Instantaneous.
        fullrec = pickle.load(open("w2v_fullrec.p", "rb"))
        wordcodes = pickle.load(open("w2v_wordcodes.p", "rb"))
        uniqueWords = pickle.load(open("w2v_uniqueWords.p", "rb"))
        wordcounts = pickle.load(open("w2v_wordcounts.p", "rb"))
        logging.debug("len_unk: {} code: {}".format(wordcounts['UNK'],
                                                    wordcodes['UNK']))
        return fullrec

    #... load in the unlabeled data file. You can load in a subset for debugging purposes.
    handle = open(filename, "r", encoding="utf8")
    fullconts = handle.read().split("\n")
    fullconts = [
        entry.split("\t")[1].replace("<br />", "")
        for entry in fullconts[1:(len(fullconts) - 1)]
    ]

    #... apply simple tokenization (whitespace and lowercase)
    fullconts = [" ".join(fullconts).lower()]

    print("Generating token stream...")
    #... (TASK) populate fullrec as one-dimension array of all tokens in the order they appear.
    #... ignore stopwords in this process
    #... for simplicity, you may use nltk.word_tokenize() to split fullconts.
    #... keep track of the frequency counts of tokens in origcounts.
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(fullconts[0])
    fullrec = list(filter(lambda x: x not in stop_words, words))
    logging.debug("fullrec: {}".format(fullrec[:100]))
    min_count = 50
    origcounts = Counter(fullrec)

    print("Performing minimum thresholding..")
    #... (TASK) populate array fullrec_filtered to include terms as-is that appeared at least min_count times
    #... replace other terms with <UNK> token.
    #... update frequency count of each token in dict wordcounts where: wordcounts[token] = freq(token)

    fullrec_filtered = list(
        map(lambda x: x if origcounts[x] >= min_count else 'UNK', fullrec))
    logging.debug("fullrec_filtered: {}".format(fullrec_filtered[:100]))

    #... after filling in fullrec_filtered, replace the original fullrec with this one.
    fullrec = fullrec_filtered
    wordcounts = Counter(fullrec)

    print("Producing one-hot indices")
    #... (TASK) sort the unique tokens into array uniqueWords
    #... produce their one-hot indices in dict wordcodes where wordcodes[token] = onehot_index(token)
    #... replace all word tokens in fullrec with their corresponding one-hot indices.
    uniqueWords = list(set(fullrec_filtered))
    wordcodes = {w: i for i, w in enumerate(uniqueWords)}
    #logging.debug("wordcodes: {}".format(wordcodes))
    logging.debug("len_unk: {} code: {}".format(wordcounts['UNK'],
                                                wordcodes['UNK']))
    fullrec = list(map(lambda x: wordcodes[x], fullrec))
    #logging.debug("fullrec to indices: {}".format(fullrec))

    #... close input file handle
    handle.close()

    #... store these objects for later.
    #... for debugging, don't keep re-tokenizing same data in same way.
    #... just reload the already-processed input data with pickles.
    #... NOTE: you have to reload data from scratch if you change the min_count, tokenization or number of input rows

    pickle.dump(fullrec, open("w2v_fullrec.p", "wb+"))
    pickle.dump(wordcodes, open("w2v_wordcodes.p", "wb+"))
    pickle.dump(uniqueWords, open("w2v_uniqueWords.p", "wb+"))
    pickle.dump(dict(wordcounts), open("w2v_wordcounts.p", "wb+"))

    #... output fullrec should be sequence of tokens, each represented as their one-hot index from wordcodes.
    return fullrec
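The minimum-thresholding step above can be checked in isolation; a tiny standalone version with an illustrative corpus and a min_count of 2:

from collections import Counter

tokens = ["cat", "dog", "cat", "fish", "cat", "dog"]
origcounts = Counter(tokens)
min_count = 2
filtered = [t if origcounts[t] >= min_count else "UNK" for t in tokens]
print(filtered)   # ['cat', 'dog', 'cat', 'UNK', 'cat', 'dog']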
Code example #37
    def getfeature(self, tweet):
        text = tweet["text"]
        feature = []
        words = nltk.word_tokenize(text)

        tokenizer = RegexpTokenizer(r'\w+')
        word_nopunc = tokenizer.tokenize(text)
        word_nopunc = [i for i in word_nopunc if i not in stop]

        # top 20 features using word2vec
        for i in word_nopunc:
            if i in model.wv:
                feat_list = model.wv[i].tolist()
                feature.extend(feat_list[:20])

        # pad with zeros so the word2vec block has exactly 100 values
        if (len(feature) < 100):
            for i in range(len(feature), 101):
                feature.append(0)
        feature = feature[:100]

        # Has question marks
        if text.find('?') > 0:
            feature.append(1)
        else:
            feature.append(0)

        # has !
        if text.find('!') > 0:
            feature.append(1)
        else:
            feature.append(0)

        # has hastag
        if (len(tweet['entities']['hashtags']) > 0):
            # feature.append(len(tweet['entities']['hashtags']))
            feature.append(1)
        else:
            feature.append(0)

        # has usermention
        if (len(tweet['entities']['user_mentions']) > 0):
            # feature.append(len(tweet['entities']['user_mentions']))
            feature.append(1)
        else:
            feature.append(0)

        # has url
        if (len(tweet['entities']['urls']) > 0):
            # feature.append(len(tweet['entities']['urls']))
            feature.append(1)
        else:
            feature.append(0)

        # has media
        if ('media' in tweet['entities']):
            # feature.append(len(tweet['entities']['media']))
            feature.append(1)
        else:
            feature.append(0)

        # sentiment analysis
        clean_tweet = ' '.join(
            re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
                   text).split())
        analysis = TextBlob(clean_tweet)

        if analysis.sentiment.polarity > 0:
            feature.append(1)
        else:
            feature.append(0)

        # # has poll
        # if ('polls' in tweet['entities']):
        # 	# feature.append(len(tweet['entities']['media']))
        # 	feature.append(1)
        # else:
        # feature.append(0)

        # Likes
        # if ((tweet['favorite_count']) > 0):
        # 	# feature.append(len(tweet['entities']['media']))
        # 	feature.append((tweet['favorite_count']))
        # else:
        # 	feature.append(0)

        # # Retweets
        # if ((tweet['retweet_count']) > 0):
        # 	# feature.append(len(tweet['entities']['media']))
        # 	feature.append((tweet['retweet_count']))
        # else:
        # 	feature.append(0)

        #	favourited
        # if ('favourited' in tweet and tweet['favourited']):
        # 	feature.append(1)
        # else:
        # 	feature.append(0)

        # #	Retweeted
        # if ('retweeted' in tweet and tweet['retweeted']):
        # 	feature.append(1)
        # else:
        # 	feature.append(0)
        # is source
        # if (source == )

        # Ratio of uppercase characters to total characters
        uppers = [l for l in text if l.isupper()]
        capitalratio = len(uppers) / len(text)
        feature.append(capitalratio)

        count_punct = 0
        # negative words list
        neg_words = [
            "not", "no", "nobody", "none", "never", "neither", "nor",
            "nowhere", "hardly", "scarcely", "barely", "don't", "isn't",
            "wasn't", "shouldn't", "wouldn't", "couldn't", "doesn't"
        ]

        count_neg_words = 0
        # count number of punctuations and negative words
        for i in words:
            if (i in (string.punctuation)):
                count_punct += 1
            if (i in neg_words):
                count_neg_words += 1

        feature.append(count_punct)
        feature.append(count_neg_words)
        swearwords = []
        with open('badwords.txt', 'r') as f:
            for line in f:
                swearwords.append(line.strip().lower())

        hasswearwords = 0
        for token in word_nopunc:
            if token in swearwords:
                hasswearwords += 1
        feature.append(hasswearwords)

        return feature
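# Hypothetical usage sketch (not from the original code): getfeature expects a
# tweet dict shaped like the Twitter API payloads used above, and relies on the
# globals `model` (word2vec), `stop` (stopword set) and badwords.txt being
# available. The call itself is left commented out because the enclosing class
# is not shown here.
sample_tweet = {
    "text": "Is this really happening?! #breaking",
    "entities": {"hashtags": [{"text": "breaking"}],
                 "user_mentions": [], "urls": []},
}
# feature_vector = feature_extractor.getfeature(sample_tweet)  # instance name is hypothetical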
コード例 #38
0
def visualize(e_visualization, s_visualization, file_name):
    # Read the source text and close the file handle promptly
    with open(file_name, "r") as file_handler:
        text = file_handler.read()

    toker = RegexpTokenizer(r'\w+')
    words = toker.tokenize(text)

    allowed_types = ["JJ", "JJR", "JJS", "NN", "NNS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]

    filtered_words = []

    #stopwords removal
    for w in words:
        if w not in stop_words:
            filtered_words.append(w)

    pos = nltk.pos_tag(filtered_words)
    allowed_words = []
    #print(allowed_words)

    for p in pos:
        if p[1] in allowed_types:
            allowed_words.append(p[0].lower())
            
    e_pos = []
    e_neg = []
    s_pos = []
    s_neg = []

    # Map each emotion/sentiment key to its word-finding helper so the
    # repeated if/else blocks collapse into a single loop.
    emotion_finders = [
        ('anger', findAnger), ('ant', findAnt), ('disgust', findDisgust),
        ('fear', findFear), ('joy', findJoy), ('sadness', findSadness),
        ('surprise', findSurprise), ('trust', findTrust),
        ('positivity', findPos), ('negativity', findNeg),
    ]

    # A score of 0 routes the matching words into the negative bucket;
    # anything else (including a missing key) goes into the positive bucket.
    for d in e_visualization:
        for key, finder in emotion_finders:
            words = finder(allowed_words)
            if key in d and d[key] == 0:
                e_neg.append(words)
            else:
                e_pos.append(words)

    for d in s_visualization:
        for key, finder in emotion_finders:
            words = finder(allowed_words)
            if key in d and d[key] == 0:
                s_neg.append(words)
            else:
                s_pos.append(words)

    final_visualization = {'e_pos':e_pos, 'e_neg':e_neg, 's_pos':s_pos, 's_neg':s_neg}
    return final_visualization
コード例 #39
0
#Load the data from IMDB_Dataset.csv into a dataframe
data = pd.read_csv('IMDB_Dataset.csv', encoding='utf-8')
data.head()

#create empty list
review_data_list = list()

indv_lines = data['review'].values.tolist()

#Tokenizer that also strips punctuation, and the English stopword list
#(built once, outside the loop)
rem_tok_punc = RegexpTokenizer(r'\w+')
stop_word_list = set(stopwords.words('english'))

for line in indv_lines:

    #create word tokens as well as remove punctuation in one go
    tokens = rem_tok_punc.tokenize(line)

    #convert the words to lower case
    words = [w.lower() for w in tokens]

    #Remove stop words
    words = [w for w in words if not w in stop_word_list]

    #Append words to the review_data_list list.
    review_data_list.append(words)
len(review_data_list)

#Train a Word2Vec model using Gensim
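# A minimal sketch of the announced training step (parameters are
# illustrative, not taken from the original notebook):
from gensim.models import Word2Vec

w2v_model = Word2Vec(
    sentences=review_data_list,  # list of token lists built above
    vector_size=100,             # named `size` in gensim < 4
    window=5,
    min_count=2,
    workers=4)
w2v_model.save('imdb_word2vec.model')  # output filename is hypothetical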
    # Replacing or removing emojis.
    text = demoji.replace(text, " ")

    # Lowercasing.
    text = text.lower()

    # Removing punctuation.
    text = rm_punctuation(text)

    # Removing stopwords - i.e. the, a, an, he.
    text = rm_stopwords(text)

    # TODO: negation handling - how to keep the negation meaning?

    # Tokenization.
    text = tokenizer.tokenize(text)

    # Removing repeating letters (i.e. awesooome to awesome, *)
    #text = correct_text(text)

    # lemmatizing the words
    text = lemmatize_text(text)

    # Puts the cleaned, tokenized text data back into the data frame.
    data['text'].loc[i] = text

# Removing all the rows that are empty in the text column after cleaning
data = data.replace("", np.nan).replace([], np.nan).dropna()

########################################### ADDING COLUMNS TO DATA ###########################################
data['states'] = data['place_full_name'].apply(lambda row: get_state(row))
コード例 #41
0
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer  #Importing module for sentence tokenisation
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(
    "[\w']+")  #Function to tokenise from regular expression
lemmatizer = WordNetLemmatizer()
sentence = "You would need to add materials that you need to use. You also would want to know how much vinegar you should pour in the cups. You should also say what you should label the container with if it should be the sample or letters like A"

arr = []
arr1 = []
sentence = tokenizer.tokenize(sentence)

for i in sentence:
    j = stemmer.stem(i)
    arr.append(j)
    k = lemmatizer.lemmatize(i)
    arr1.append(k)

print(sentence)
print('\n')
print("--------------------------------------------------------")
print('\n')
print(arr)
print('\n')
print("*********************************************************")
print('\n')
print(arr1)
# create English stop words list
en_stop = stopwords.words('english')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:

    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

#print(texts)
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
#print(dictionary.token2id)
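# A sketch of the usual next step (assumed here, not shown in the original
# snippet): build a bag-of-words corpus from the dictionary and fit an LDA model.
from gensim.models import LdaModel

corpus = [dictionary.doc2bow(text) for text in texts]
lda_model = LdaModel(corpus,
                     num_topics=5,  # illustrative value
                     id2word=dictionary,
                     passes=10)
#print(lda_model.print_topics())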
コード例 #43
0
ファイル: analyzer.py プロジェクト: shashmaxus/mlivos
    def preprocessor(self, text, max_words=0):
        env = Environment()
        t_start = timer()
        text2 = text.lower()
        env.debug(1, ['Analyzer', 'preprocessor', 'START Preprocessing:'])
        tokenizer = RegexpTokenizer(self.word_tokenizers_custom())
        tokens_words = tokenizer.tokenize(text2)  # Words of the text
        tokens_sent = sent_tokenize(
            text2)  # Sentences - not used in this project yet

        n_words_count = len(tokens_words)  # Number of words in the text
        n_sent_count = len(tokens_sent)  # Number of sentences in the text
        n_sent_len_mean = n_words_count / n_sent_count  # Average sentence length in words

        # Split the text into parts - chunks
        awords = []  # Resulting array
        # If the document is large, split it into several parts (chunks) and compute
        # the statistics for each part separately.
        # This lets us train the model correctly even with a small number of large documents.
        if (max_words > 0):
            n_sent_chunk = int(
                max_words // n_sent_len_mean
            )  # How many sentences go into one chunk containing max_words

            print('n_sent_chunk', n_sent_chunk)
            # tune so that the text is split evenly
            i_chunks = 1
            tmp_sent_chunk = n_sent_count
            while tmp_sent_chunk > n_sent_chunk:
                i_chunks = i_chunks + 1
                tmp_sent_chunk = int(
                    math.ceil(n_sent_count // i_chunks) +
                    (n_sent_count % i_chunks))

            n = 0
            n_sent_chunk = tmp_sent_chunk  # final number of sentences per chunk
            print('tmp_sent_chunk', tmp_sent_chunk)

            while n < n_sent_count:
                #print(n, n_sent_chunk)
                asents = tokens_sent[
                    n:n + n_sent_chunk]  # Sentences from n to n+chunk
                #print(asents)
                a_sent_words = []  # words of the current group of sentences
                for sent in asents:
                    words = tokenizer.tokenize(sent)
                    a_sent_words.extend(words)
                #print(a_sent_words)
                awords.append([
                    n_sent_count, n_words_count,
                    len(a_sent_words) / len(asents),
                    len(asents),
                    len(a_sent_words), a_sent_words
                ])
                n = n + n_sent_chunk
        else:
            awords.append([
                n_sent_count, n_words_count, n_sent_len_mean,
                len(tokens_sent),
                len(tokens_words), tokens_words
            ])
        #print(awords)
        t_end = timer()
        env.debug(1, ['Preprocessed:', 'time:', env.job_time(t_start, t_end)])
        return awords  # Array with words and statistics
コード例 #44
0
ファイル: jvalin_P1.py プロジェクト: jvalin17/Python-Programs
def querydocsim(query, filename):
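    # Computes a tf-idf similarity score between `query` and the document
    # `filename`: the query is tokenized, stopword-filtered and stemmed,
    # weighted by idf (looked up in the global `visited`/`vm` tables),
    # length-normalized, and then dotted with the document's precomputed
    # normalized vector (globals `v`, `cx`, `q`, `cdv`). Returns 0 if the
    # document is unknown.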

    qt = []
    ch = []
    ft = []
    fv = []
    qd = []
    etq = []
    qdv = []
    et = []
    et1 = []
    eb = []

    qry = query

    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    tokns = tokenizer.tokenize(qry)

    ch = sorted((stopwords.words('english')))

    tokns = [w for w in tokns if not w.lower() in ch]

    stemmer = PorterStemmer()
    for p in range(len(tokns)):
        tokns[p] = stemmer.stem(tokns[p])

    cin = Counter(tokns)

    qt = list(cin.keys())

    for i in range(len(qt)):
        et.append(qt[i])

    for i in range(len(et)):
        et1.append(0)

    qd = list(cin.values())

    for i in range(len(qd)):
        eb.append(qd[i])
    #idf of particular word
    for l in range(len(visited)):
        for j in range((len(et))):
            if et[j] == visited[l]:
                et1[j] = vm[l]

    #tf idf product
    for i in range(len(qd)):
        sq = et1[i] * (1 + math.log10(eb[i]))
        etq.append(sq)

    qv = 0
    #vector normalization of query
    for j in range(len(eb)):
        qv = qv + math.pow(etq[j], 2)
    for j in range(len(eb)):
        cq = etq[j] / math.sqrt(qv)
        qdv.append(cq)

    fl = filename

    if fl in v:
        for j in range(len(v)):
            if fl == v[j]:
                h = j
    else:
        return (0)

    rc = []
    rt = cx[h - 1]
    ru = cx[h]

    for i in range(rt, ru - 1):
        ft.append(q[i])
        fv.append(cdv[i])

    for i in range(len(et)):
        fry = et[i]
        if fry in ft:
            for j in range(len(ft)):
                if fry == ft[j]:
                    rc.append(qdv[i] * fv[j])

        else:
            rc.append(0)

    # the similarity score is the sum of the per-term weight products
    cs = sum(rc)

    return cs
コード例 #45
0
ファイル: DCM.py プロジェクト: sondn141/ManiGAN
def gen_example(wordtoix, algo, imsize, image_transform, norm, data_dir):
    '''generate images from example sentences'''
    from nltk.tokenize import RegexpTokenizer
    filepath = '%s/example_filenames.txt' % (cfg.DATA_DIR)
    data_dic = {}
    with open(filepath, "r") as f:
        filenames = f.read().split('\n')
        for name in filenames:
            if len(name) == 0:
                continue

            flip = random.rand() > 0.5
            new_w = new_h = int(256 * 76 / 64)
            x = random.randint(0, np.maximum(0, new_w - 256))
            y = random.randint(0, np.maximum(0, new_h - 256))

            img_name = name.replace("text", "images")
            img_path = '%s/%s.jpg' % (data_dir, img_name)
            imgs = get_imgs(img_path, imsize, flip, x, y,
                        None, image_transform, norm)

            real_imgs = []
            for i in range(len(imgs)):
                if cfg.CUDA:
                    real_imgs.append(Variable(imgs[i]).cuda())
                else:
                    real_imgs.append(Variable(imgs[i]))

            filepath = '%s/%s.txt' % (cfg.DATA_DIR, name)
            with open(filepath, "r") as f:
                print('Load from:', name)
                sentences = f.read().split('\n')
                # a list of indices for a sentence
                captions = []
                cap_lens = []
                for sent in sentences:
                    if len(sent) == 0:
                        continue
                    sent = sent.replace("\ufffd\ufffd", " ")
                    tokenizer = RegexpTokenizer(r'\w+')
                    tokens = tokenizer.tokenize(sent.lower())
                    if len(tokens) == 0:
                        print('sent', sent)
                        continue

                    rev = []
                    for t in tokens:
                        t = t.encode('ascii', 'ignore').decode('ascii')
                        if len(t) > 0 and t in wordtoix:
                            rev.append(wordtoix[t])
                    captions.append(rev)
                    cap_lens.append(len(rev))
            max_len = np.max(cap_lens)

            sorted_indices = np.argsort(cap_lens)[::-1]
            cap_lens = np.asarray(cap_lens)
            cap_lens = cap_lens[sorted_indices]
            cap_array = np.zeros((len(captions), max_len), dtype='int64')
            for i in range(len(captions)):
                idx = sorted_indices[i]
                cap = captions[idx]
                c_len = len(cap)
                cap_array[i, :c_len] = cap
            key = name[(name.rfind('/') + 1):]
            data_dic[key] = [cap_array, cap_lens, sorted_indices, real_imgs]
    algo.gen_example(data_dic)
コード例 #46
0
import io
import re
import scipy
from nltk.tokenize import RegexpTokenizer
import numpy as np
import scipy.sparse
import pickle
from scipy.sparse.linalg import svds
import math
from gensim.models import Word2Vec, KeyedVectors
# I will be using 1 late day
# mp3564

# Preprocessing
with io.open('data/brown.txt', 'r') as dataset:
    data = dataset.readlines()
    tokenizer = RegexpTokenizer(r'\w+')
    sent_all = [tokenizer.tokenize(datapoint.lower()) for datapoint in data]

    unique_set = set(word for sent in sent_all for word in sent)
    unique = sorted(list(unique_set))
    vocab_size = len(unique_set)

# Word2Vec Model
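# Note: gensim >= 4 renamed the `size` argument used below to `vector_size`;
# this call assumes an older gensim (< 4).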

model = Word2Vec(sent_all, size=300, window=2, negative=5)
word_vectors = model.wv

word_vectors.save("modelswv.kv")
word_vectors = KeyedVectors.load("modelswv.kv")


# SVD Helper Functions
コード例 #47
0
            # ignore captions with manually selected bad words
            if check_bad_words(lemmatized_list) == False:
                continue

            # analyze word composition
            unigram_list, bigram_list = update_ngram_freq(lemmatized_list)

            # ignore meaningless captions
            if len(unigram_list) == 0 or len(bigram_list) == 0:
                continue

            sentence = {}
            # sentence['raw'] = cap # raw caption
            sentence['clean'] = reduced_cap # cleaned caption
            sentence['tokens'] = tokenizer.tokenize(reduced_cap) # tokens

            sentence['unigrams'] = unigram_list
            sentence['bigrams'] = bigram_list

            img["sentences"].append(sentence)


        img["filename"] = imgID # filename of image
        img["url"] = raw_data[imgID]["image_url"] # download url of image

        image_list.append(img)

    # print most common unigrams and bigrams with their frequency
    # print ('\n'.join([i + '\t'+ str(j) for i, j in Counter(unigram_dict).most_common()]))
    # print ('\n'.join([i + '\t' + str(j) for i, j in Counter(bigram_dict).most_common()]))
コード例 #48
0
from nltk.tokenize import TreebankWordTokenizer

tbwt = TreebankWordTokenizer()

print(tbwt.tokenize(english_text))

complex_text = "This is a free country, isn't it?"

print(tbwt.tokenize(complex_text))

#Reg Exp tokenize
from nltk.tokenize import RegexpTokenizer

reg = RegexpTokenizer("[a-zA-Z0-9\']+")

print(reg.tokenize(complex_text))

#Stop words removal
from nltk.corpus import stopwords

sw = set(stopwords.words('english'))

print(sw)

out = [
    word.lower() for word in reg.tokenize(complex_text)
    if word.lower() not in sw
]
print(out)

#Stemming
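# A sketch of the stemming step announced above (assumed continuation, not
# from the original), using NLTK's PorterStemmer on the filtered tokens `out`:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
stemmed = [ps.stem(word) for word in out]
print(stemmed)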
train, test = train_test_split(df, test_size=0.33, random_state=42)

columnsData = df.loc[:, 'comment_text']
columnsDataTrain = train.loc[:, 'comment_text']
columnsDataTest = test.loc[:, 'comment_text']

#Total column Data
yo = columnsData.unique()
setyo = set(yo)
mylist = list(setyo)
str1 = ''.join(mylist)
tokenizer = RegexpTokenizer(r'\w+')

#Length column Data
wordLength = len(tokenizer.tokenize(str1))
wordList = tokenizer.tokenize(str1)
unique = set(wordList)
uniqueWordList = list(unique)
uniqueLength = len(uniqueWordList)

#Total Train Data
yo1 = columnsDataTrain.unique()
setyo1 = set(yo1)
mylist1 = list(setyo1)
str2 = ''.join(mylist1)
tokenizer1 = RegexpTokenizer(r'\w+')

#Length Train Data
wordLength1 = len(tokenizer1.tokenize(str2))
wordList1 = tokenizer1.tokenize(str2)
コード例 #50
0
p_stemmer = PorterStemmer()

# Create English stop words
stopset = stopwords.words('english')

#check if word contains digits
def isContainPorD(s):
    return re.search(r'(\d)', s)

#loop through document list
for index,docu in enumerate(data):
    if index>4000:
        break
    content=docu["Content"].lower()
    #remove stop words and digits and punctuations
    removed_tokens = [i for i in tokenizer.tokenize(content) if i not in stopset and not isContainPorD(i)]
    #stem tokens
    tokens = [p_stemmer.stem(i) for i in removed_tokens]
    #preprocessed texts
    texts.append(tokens)


num_of_testcase = 50  # number of test cases
test_data=texts[:num_of_testcase]

input_data=[]
extra_data=[]
for text in test_data:
    # random.shuffle(text)
    half = len(text) // 2
    input_data.append(text[:half])
    extra_data.append(text[half:])
コード例 #51
0
response = requests.get(url)
raw = response.text
# lowercase the text
raw = raw.lower()


print "type of text", type(raw)
print "length of text:", len(raw)
print "first 100 characters:", raw[:100]
print "=========="

# create a version of the text without punctuation
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
raw_nonpunct = tokenizer.tokenize(raw)

### want to replace contractions before tokenizing
#replacer = RegexpReplacer()
#raw_replaced = nltk.word_tokenize(replacer.replace(raw))

### Now, to tokenize the text
sent_tokens = nltk.sent_tokenize(raw)
word_tokens = nltk.word_tokenize(raw)

# filter the text for stopwords
from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))
word_tokens_clean = [word for word in raw_nonpunct if word not in english_stops]
COMC_text = nltk.Text(word_tokens_clean)
raw_text = nltk.Text(word_tokens)
コード例 #52
0
with open('data/twitter_tweets_pruned.json') as data:
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    nltk.download('stopwords')
    nltk.download('punkt')
    tokenizer = RegexpTokenizer(r'\w+')
    data = json.load(data)
    copy = {}
    stop_words = set(stopwords.words('english'))
    i = 0
    porter = PorterStemmer()

    # stop word removal
    for key, values in data.items():
        copy[key] = []
        for val in values:
            tweet = tokenizer.tokenize(val)
            filtered_tweet = []
            for w in tweet:
                if w not in stop_words:
                    w = porter.stem(w)
                    filtered_tweet.append(w)
            tmp = " ".join(str(x) for x in filtered_tweet)
            tmp2 = tmp.encode('ascii', 'ignore').decode("utf-8")
            lang = (identifier.classify(tmp2))[0]
            if lang == "en":
                copy[key].append(tmp)
        i = i + 1
        print(i)

    with open('data/twitter_tweets_no_unicode_eng.json', 'w') as output1:
        json.dump(copy, output1)
コード例 #53
0
porter = PorterStemmer()                                                        #Stemming on the words using porter's algorithm                                              
stop_words=set(stopwords.words('english'))

inverted_index={}
file_name_ID={}

path='D:/MTECH/SEM 2/Information Retrieval/Assignments/Assignment 1/20_newsgroups'  #Path to the document collection
i=0
words_list=[]
for root,dirs,files in os.walk(path,topdown=False): 
            for name in files:
                directory=root.split("\\",1)[1]
                file_name_ID[i]=directory+'/'+name
                path_file=os.path.join(root,name)
                header_processed_string=header_preprocess(path_file)
                words=tokenizer.tokenize(header_processed_string)               # To remove punctuation mark, comma etc and to form tokens
                for word in words:
                    if (word not in stop_words):                                # Removing Stop Words
                        word=word.lower()                                       # Normalization
                        try:
                            stemmed_word=porter.stem(word)
                            stemmed_word = unicode(stemmed_word, errors='ignore')
                        except:
                            # skip words that fail to stem/decode instead of
                            # re-appending the previous stemmed_word
                            continue
                        words_list.append(stemmed_word)
                        if (stemmed_word not in inverted_index):
                            inverted_index[stemmed_word]=[]
                        if i not in inverted_index[stemmed_word]:
                            inverted_index[stemmed_word].append(i)
                i+=1
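# A small lookup sketch (not in the original script; the helper name is
# illustrative): apply the same lowercasing and stemming to a query term,
# then map the matching document ids back to filenames via file_name_ID.
def lookup(term):
    stemmed = porter.stem(term.lower())
    return [file_name_ID[doc_id] for doc_id in inverted_index.get(stemmed, [])]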
コード例 #54
0
def prepare_data(filepath, num_data_points=40000, vocab_size=4000, max_length=500):
    train_set_proportion = 0.9
    train_size = int(num_data_points * train_set_proportion)

    print("Preparing Data...")
    current_file = open(filepath, "rb")
    x = current_file.read()
    current_file.close()

    x = x.decode("utf-8")
    x = x.splitlines()
    random.shuffle(x)
    x = x[:num_data_points]
    labels = []
    reviews = []

    reTokenizer = RegexpTokenizer(r'\w+')

    for i in x:
        separated = i.split(" ", 1)
        labels.append(separated[0])
        reviews.append(separated[1])

    for i in range(len(labels)):
        labels[i] = int(labels[i] == '__label__1')

    all_words = []
    for i in range(len(reviews)):
        tokens = reTokenizer.tokenize(reviews[i])
        reviews[i] = []
        for word in tokens:
            word = word.lower()
            all_words.append(word)
            reviews[i].append(word)

    vocab_pickle_location = os.path.join(vocab_directory, "all_words.pkl")

    if not os.path.isdir(vocab_directory):
        print("Error: vocab_directory doesn't exist!")
    else:
        all_words = pickle.load(open(vocab_pickle_location, 'rb'))
        all_words = all_words[:vocab_size]

    word2int = {all_words[i][0]: i + 1 for i in range(vocab_size)}

    # int2word = {x: y for y, x in word2int.items()}
    # dict_as_list = list(word2int)

    def review2intlist(rev_text):
        int_list = []
        for word in rev_text:
            if word in word2int.keys():
                int_list.append(word2int[word])
        return int_list

    X = []
    for i in range(len(reviews)):
        X.append(review2intlist(reviews[i]))
    X = sequence.pad_sequences(X, maxlen=max_length)

    LSTM_inputs = np.zeros(shape=(max_length, num_data_points), dtype=np.float32)
    for i in range(num_data_points):
        LSTM_inputs[:, i] = X[i]
    LSTM_inputs = LSTM_inputs.T

    LSTM_outputs = np.zeros(shape=num_data_points)
    for i in range(num_data_points):
        LSTM_outputs[i] = labels[i]

    x_train, y_train = LSTM_inputs[:train_size], LSTM_outputs[:train_size]
    x_test, y_test = LSTM_inputs[train_size:], LSTM_outputs[train_size:]

    half_test_size = int(len(y_test)/2)
    x_valid = x_test[:half_test_size]
    y_valid = y_test[:half_test_size]
    x_test = x_test[half_test_size:]
    y_test = y_test[half_test_size:]

    print("Finished preparing data...")
    return x_train, y_train, x_test, y_test, x_valid, y_valid
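# Hypothetical usage (the file path is illustrative); the splits are returned
# in this order:
# x_train, y_train, x_test, y_test, x_valid, y_valid = prepare_data("reviews_labeled.txt")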
コード例 #55
0
import nltk
from nltk.tokenize import RegexpTokenizer

puncts_exceptapostrophe = '!"#$%&()*+,-./:;<=>?@[\]^`{|}~'
TOKENIZE_PATTERN = fr"[{puncts_exceptapostrophe}]|\w+|['\w]+"
regex_tokenizer = RegexpTokenizer(pattern=TOKENIZE_PATTERN)

output_file = open("output.txt", "w")

with open('tr.txt', "r") as reader:
    lines = reader.readlines()
    for line in lines:
        line = line.lower()
        tokens = regex_tokenizer.tokenize(line)
        sentence = " ".join(tokens)
        output_file.write(sentence + "\n")

output_file.close()
コード例 #56
0
def dataset():
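    # Reads each CSV/XML pair from csv_dir, keeps non-question sentences,
    # buckets them by their tokens-per-triple ratio (>=20, 10-20, 5-10) into
    # three groups of up to 200 sentences each, writes the selected sentences
    # to experiment.csv, and accumulates token/triple/predicate statistics in
    # the module-level globals declared below.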
    global tokens_triples
    global num_sentences
    global included_sentences
    global processed_facts
    global num_triples
    global num_tokens
    global num_annotations
    global unique_predicates
    global predicates
    global dictionary

    if os.path.exists(exp_dir + 'experiment.csv'):
        os.remove(exp_dir + 'experiment.csv')
    tokenizer = RegexpTokenizer(r'\w+')
    with open(exp_dir + 'experiment.csv', 'wb') as exp_file:
        fieldnames = ['Sentence']
        writer = csv.DictWriter(exp_file, fieldnames=fieldnames)
        writer.writeheader()

        for filename in glob.glob(os.path.join(csv_dir, '*.csv')):
            #print filename
            with open(filename, 'rb') as csv_file:
                csv_reader = csv.reader(csv_file)
                csv_reader.next()
                rows_flag = 0
                with open(
                        filename.replace('CSV', 'XML').replace('csv', 'xml'),
                        'r') as xml_file:
                    sentence_number = -1
                    xml = xml_file.read()
                    xml = unicode(xml, 'ascii', errors='ignore')
                    root = ET.fromstring(xml)

                    for row in csv_reader:
                        sentence_number = sentence_number + 1
                        num_sentences = num_sentences + 1
                        """
                        if len(root[sentence_number][5]) >= 1 and row[0].find('?') == -1:
                            rows_flag = rows_flag + 1
                            dictionary.append({'annotated_sentence': root[sentence_number][2].text, 'value': root[sentence_number][0].text, 'triples': [], 'simplification': ''})
                            for triple in range(0, len(root[sentence_number][5])):
                                dictionary[sum(included_sentences)]['triples'].append(root[sentence_number][5][triple].text)

                            included_sentences[0] = included_sentences[0] + 1
                            writer.writerow({'Sentence': row[0]})
                            num_triples.append(len(root[sentence_number][5]))
                            num_tokens.append(len(tokenizer.tokenize(root[sentence_number][0].text)))    
                            tokens_triples.append(len(tokenizer.tokenize(root[sentence_number][0].text)) / len(root[sentence_number][5]))
                            processed_facts = processed_facts + len(root[sentence_number][5])
                        """

                        if len(root[sentence_number][5]) >= 1:
                            if row[0].find('?') == -1 and (
                                    len(
                                        tokenizer.tokenize(
                                            root[sentence_number][0].text)) /
                                    len(root[sentence_number][5]) >=
                                    20) and included_sentences[2] < 200:
                                rows_flag = rows_flag + 1
                                dictionary.append({
                                    'annotated_sentence':
                                    root[sentence_number][2].text,
                                    'value':
                                    root[sentence_number][0].text,
                                    'triples': [],
                                    'simplification':
                                    ''
                                })
                                for triple in range(
                                        0, len(root[sentence_number][5])):
                                    dictionary[sum(
                                        included_sentences)]['triples'].append(
                                            root[sentence_number][5]
                                            [triple].text)

                                included_sentences[
                                    2] = included_sentences[2] + 1
                                writer.writerow({'Sentence': row[0]})
                                #print row[0]
                                num_annotations = num_annotations + get_annotations(
                                    filename.rsplit('/', 1)[-1],
                                    root[sentence_number][2].text)
                                for triple in range(
                                        0, len(root[sentence_number][5])):
                                    predicate = get_predicate(
                                        root[sentence_number][5][triple].text)
                                    if predicate not in unique_predicates:
                                        unique_predicates.append(predicate)
                                        predicates[predicate] = 1
                                    else:
                                        predicates[predicate] = predicates[
                                            predicate] + 1
                                num_triples.append(
                                    len(root[sentence_number][5]))
                                num_tokens.append(
                                    len(
                                        tokenizer.tokenize(
                                            root[sentence_number][0].text)))

                                tokens_triples.append(
                                    len(
                                        tokenizer.tokenize(
                                            root[sentence_number][0].text)) /
                                    len(root[sentence_number][5]))
                                #print sentence_number
                                #print xml_file
                                processed_facts = processed_facts + len(
                                    root[sentence_number][5])
                            if row[0].find('?') == -1 and (len(tokenizer.tokenize(root[sentence_number][0].text)) / len(root[sentence_number][5]) < 20) \
                               and len(tokenizer.tokenize(root[sentence_number][0].text)) / len(root[sentence_number][5]) >= 10 and included_sentences[1] < 200:
                                rows_flag = rows_flag + 1
                                dictionary.append({
                                    'annotated_sentence':
                                    root[sentence_number][2].text,
                                    'value':
                                    root[sentence_number][0].text,
                                    'triples': [],
                                    'simplification':
                                    ''
                                })
                                for triple in range(
                                        0, len(root[sentence_number][5])):
                                    dictionary[sum(
                                        included_sentences)]['triples'].append(
                                            root[sentence_number][5]
                                            [triple].text)

                                included_sentences[
                                    1] = included_sentences[1] + 1
                                writer.writerow({'Sentence': row[0]})
                                #print row[0]
                                num_annotations = num_annotations + get_annotations(
                                    filename.rsplit('/', 1)[-1],
                                    root[sentence_number][2].text)
                                for triple in range(
                                        0, len(root[sentence_number][5])):
                                    predicate = get_predicate(
                                        root[sentence_number][5][triple].text)
                                    if predicate not in unique_predicates:
                                        unique_predicates.append(predicate)
                                        predicates[predicate] = 1
                                    else:
                                        predicates[predicate] = predicates[
                                            predicate] + 1

                                num_triples.append(
                                    len(root[sentence_number][5]))
                                num_tokens.append(
                                    len(
                                        tokenizer.tokenize(
                                            root[sentence_number][0].text)))

                                tokens_triples.append(
                                    len(
                                        tokenizer.tokenize(
                                            root[sentence_number][0].text)) /
                                    len(root[sentence_number][5]))
                                #print sentence_number
                                #print xml_file
                                processed_facts = processed_facts + len(
                                    root[sentence_number][5])
                            if row[0].find('?') == -1 and (len(tokenizer.tokenize(root[sentence_number][0].text)) / len(root[sentence_number][5]) < 10) \
                               and len(tokenizer.tokenize(root[sentence_number][0].text)) / len(root[sentence_number][5]) >= 5 and included_sentences[0] < 200:
                                rows_flag = rows_flag + 1
                                dictionary.append({
                                    'annotated_sentence':
                                    root[sentence_number][2].text,
                                    'value':
                                    root[sentence_number][0].text,
                                    'triples': [],
                                    'simplification':
                                    ''
                                })
                                for triple in range(
                                        0, len(root[sentence_number][5])):
                                    dictionary[sum(
                                        included_sentences)]['triples'].append(
                                            root[sentence_number][5]
                                            [triple].text)

                                included_sentences[
                                    0] = included_sentences[0] + 1
                                writer.writerow({'Sentence': row[0]})
                                #print row[0]
                                num_annotations = num_annotations + get_annotations(
                                    filename.rsplit('/', 1)[-1],
                                    root[sentence_number][2].text)
                                for triple in range(
                                        0, len(root[sentence_number][5])):
                                    predicate = get_predicate(
                                        root[sentence_number][5][triple].text)
                                    if predicate not in unique_predicates:
                                        unique_predicates.append(predicate)
                                        predicates[predicate] = 1
                                    else:
                                        predicates[predicate] = predicates[
                                            predicate] + 1

                                num_triples.append(
                                    len(root[sentence_number][5]))
                                num_tokens.append(
                                    len(
                                        tokenizer.tokenize(
                                            root[sentence_number][0].text)))

                                tokens_triples.append(
                                    len(
                                        tokenizer.tokenize(
                                            root[sentence_number][0].text)) /
                                    len(root[sentence_number][5]))
                                #print sentence_number
                                #print xml_file
                                processed_facts = processed_facts + len(
                                    root[sentence_number][5])

                    xml_file.close()
                csv_file.close()
        exp_file.close()
    # It sorts the dictionary of predicates according to the times of occurrence.
    #print sorted(predicates.items(), key=lambda x:x[1], reverse=True)
    print('%d out of the total %d sentences have been included.' %
          (sum(included_sentences), num_sentences))
    print('Total number of facts-triples that have been included: %d' %
          (processed_facts))
    print(
        'Total number of tokens of the sentences that have been included: %d' %
        (sum(num_tokens)))
    print(
        'Total number of arguments of the sentences that have been included: %d'
        % (num_annotations))
    print(
        'Total number of unique predicates of the sentences that have been included: %d'
        % (len(unique_predicates)))
コード例 #57
0
ファイル: rank.py プロジェクト: kuzux/pagerank

###########################################################
#### Preprocess the tweets to get sets of word indices ####
#### a table of words to word indices, and its reverse ####
###########################################################

# Use non-capturing groups: RegexpTokenizer relies on re.findall, and
# capturing groups would make it return group tuples instead of token strings.
tokenizer = RegexpTokenizer(r"@?(?:\w+'\w+)|(?:\w+)")
tweets = []
word_indices = {}
all_words = []
curr_word_index = 0

with open("tweetset.txt", "r") as tweets_file:
    for line in tweets_file:
        words = tokenizer.tokenize(line)
        curr_tweet = set()
        for word in words:
            word = word.lower()
            if is_stop(word): continue
            if word in word_indices:
                curr_tweet.add(word_indices[word])
            else:
                word_indices[word] = curr_word_index
                all_words.append(word)
                curr_tweet.add(curr_word_index)
                curr_word_index += 1
        tweets.append(curr_tweet)

#######################################################
#### Constructing the adjacency matrix from tweets ####
コード例 #58
0
import nltk
from nltk.tokenize import RegexpTokenizer
#import os
#os.chdir(r"C:\Users\Bertold\Documents\CUNY\Fall 2019\Intro to Computational Linguistics\Final") 

with open("DC_transcript.txt") as fin: 
    transcript = fin.read()
#Two text files are included. Paste above or just change DC to LB. Filenames:
    #LB_transcript.txt    (Lewis Black)
    #DC_transcript.txt    (Dave Chappelle)


regxptokenizer = RegexpTokenizer(r'\w+')

lowercasetext = transcript.lower()
nopuncttxt = regxptokenizer.tokenize(lowercasetext) 

arpabet = nltk.corpus.cmudict.dict()

def phoneme_counter(str):
    Kcount = 0    
    for word in nopuncttxt:
        try:
            print(arpabet[word][0])
        except KeyError:
            pass
        try:
            for j in range(len(arpabet[word][0])):
                try:
                    if arpabet[word][0][j] == "K":
                        Kcount += 1
コード例 #59
0
#train_text+= state_union.raw("1951-Truman.txt")
#train_text+= state_union.raw("1950-Truman.txt")
#train_text+= state_union.raw("1949-Truman.txt")
#train_text+= state_union.raw("1948-Truman.txt")
#train_text+= state_union.raw("1946-Truman.txt")
#train_text+= state_union.raw("1945-Truman.txt")
#train_text+= state_union.raw("1953-Eisenhower.txt")
#train_text+= state_union.raw("1954-Eisenhower.txt")
#train_text+= state_union.raw("1955-Eisenhower.txt")
#train_text+= state_union.raw("1956-Eisenhower.txt")

stop_words = set(stopwords.words("english"))

#Tokenizing the sentence
tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(train_text)

#Stemmer and Lemmatizer instance created
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

#The sequence to get the stop words removed from the sentence
filtered_text = []

#Lemmatizing words and adding to the final array if they are not stopwords
for w in words:
    if w not in stop_words:
        w = lemmatizer.lemmatize(w)
        filtered_text.append(w)
#print(filtered_text)
print("Corpus Words: ", len(filtered_text))
    def data_preperation(self):
        print("data_preperation Activated")
        delimiter = RegexpTokenizer(r'\s+', gaps=True)  # split on whitespace delimiters (gaps=True)
        self.tokenized_review_list = [
            delimiter.tokenize(i) for i in self.review_list
        ]

        self.tokenized_description_list = [
            delimiter.tokenize(i) for i in self.description_list
        ]

        all_words = []
        max_len_description = 0
        idx = 0

        for recipe in self.tokenized_description_list:
            for word in recipe:
                idx += 1
                if word not in all_words:
                    all_words.append(word)
            if (idx > max_len_description):
                max_len_description = idx
            idx = 0

        max_len_review = 0
        idx = 0
        for recipe in self.tokenized_review_list:
            for word in recipe:
                idx += 1
                if word not in all_words:
                    all_words.append(word)
            if (idx > max_len_review):
                max_len_review = idx
            idx = 0

        self.all_words = all_words
        self.max_len_description = max_len_description
        self.max_len_review = max_len_review

        # zipped = zip(description_list,review_list)

        # Encoding 1
        # encoded_description = [one_hot(d, vocab_size) for d in description_list]
        # encoded_review = [one_hot(d, vocab_size) for d in review_list]
        # print(len(encoded_description[0]))

        # Encoding 2
        self.vocab_size = len(all_words)

        max_words = self.vocab_size + 5
        t = Tokenizer(num_words=max_words)

        # words --> integers
        t.fit_on_texts(self.description_list + self.review_list)
        encoded_des = list(t.texts_to_sequences(self.description_list))
        encoded_rev = list(t.texts_to_sequences(self.review_list))
        self.tokenizer = t

        # Pad-Sequence - Zero Padding
        # self.padded_encoded_description = pad_sequences(encoded_des, maxlen=self.max_len_description, padding='post')
        # self.padded_encoded_review = pad_sequences(encoded_rev, maxlen=self.max_len_review, padding='post')
        self.padded_encoded_description = pad_sequences(
            encoded_des, maxlen=self.max_len_description, padding='pre')
        self.padded_encoded_review = pad_sequences(encoded_rev,
                                                   maxlen=self.max_len_review,
                                                   padding='pre')
        print(self.padded_encoded_description[0])