def process_statuses(uid):
	statuses_list = {}
	in_path = 'Data/'+uid+'/statuses_list.pickle'
	if os.path.exists(in_path):
		f = open(in_path,'rb')
		j = 0
		while True:
			try:
				statuses = pickle.load(f)
				for status in statuses:
					j += 1
					tweet = status.text
					sents = sent_tokenize(tweet)
					text = ""
					for sent in sents:
						#print("Sent: ", sent)
						sent_text = re.sub(r'RT\s@\w+:\s|@\w+\s|#|http://.*$|http://.*\s|https://.*$|https://.*\s|\n|\\U\w+', "", sent)
						sent_text = highpoints.sub("", sent_text)
						#print(sent_text)
						tokens = word_tokenize(sent_text)
						words = [w.lower() for w in tokens if w.isalpha() or w.isalnum()]
						stop_words = set(stopwords.words('english'))
						filtered_words = [w for w in words if not w in stop_words]
						statuses_list[sent] = filtered_words	#structure: key: the original sentence, value: filtered_words
			except EOFError:
				print(j)
				break
	#print("statuses_list: ", statuses_list)
	return statuses_list 
def embed(sentences):
    model = word2vec.load('~/word2vec_models/GoogleNews-vectors-negative300.bin')
    embedded_sentences = []
    tokenized_sentences = []

    max_len = 0
    for sentence in sentences:
        tokenized_sentence = word_tokenize(sentence)  # word-level tokens; sent_tokenize would return the whole sentence as one item
        tokenized_sentences.append(tokenized_sentence)
        if len(tokenized_sentence) > max_len:
            max_len = len(tokenized_sentence)


    for sentence in sentences:
        tokenized_sentence = word_tokenize(sentence)
        embedded_words = []
        
        for word in tokenized_sentence:
            try:
                word = model[word]  # look up the 300-d embedding for this token
            except KeyError:
                word = np.zeros(300)  # out-of-vocabulary tokens fall back to a zero vector
            embedded_words.append(word)

        #padding    
        for i in range(max_len - len(embedded_words)):
            embedded_words.append(np.zeros(300))

        embedded_sentences.append(embedded_words)

    embedded_sentences = np.array(embedded_sentences)

    return embedded_sentences
Example #3
def sentences(a, b):
    """Return sentences in both a and b"""
    asplit = sent_tokenize(a)
    bsplit = sent_tokenize(b)
    # use set again
    same = {x for x in asplit if x in bsplit}
    return list(same)
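A quick usage sketch for the function above (the two strings are made up for illustration); only the sentence shared by both inputs comes back.

print(sentences("Hello there. How are you?", "How are you? Fine."))
# -> ['How are you?']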
Example #4
def load_file_sentences(filepath):
    index = filepath.rfind('/')
    if index < 0:
        sents = sent_tokenize(PlaintextCorpusReader('.', filepath).raw())
    else:
        sents = sent_tokenize(PlaintextCorpusReader(filepath[:index], filepath[index+1:]).raw())
    return sents
Example #5
def realtime():
    model_parsing()
    data_df=pd.read_csv('Test_Survey.csv')
    data_df.Verbatim=data_df.Verbatim.fillna(0)
    unique_id=data_df['Unique_Id']
    verbatims=data_df['Verbatim']
    data_dict = dict(zip(unique_id, verbatims))
    Results_df=pd.DataFrame(columns=('Unique_id','Sentence', 'category', 'Sentiment'))
    model_df = pd.read_csv('Model_modified_twitter_test.csv')
    for uid,line in data_dict.items(): 
        line=str(line).decode('utf-8',errors='ignore') #make sure the program doesn't run into a Unicode error; add error handling to avoid issues with other formats
        try:
            line_list=tokenize.sent_tokenize(str(line))
            for line in line_list:
                original_line=line
                for p in list(punctuation):
                    line=line.replace(p,'')
                line=line.lower()
                line_SC=tb.blob.BaseBlob(line)
                line=line_SC.correct()
                line=str(line)
                #print uid
                sentiment_score=sentiment_calc(line)
                
                temp_df=core_classify(line,uid,sentiment_score,model_df,original_line)
                #Results_df = Results_df.append(temp_df)
                
                yield temp_df
        except UnicodeEncodeError:
            temp_df = pd.DataFrame({'Unique_id':[uid],'Sentence':[original_line],'category':['Invalid text data'],'Sentiment':[sentiment_score]})
            yield temp_df
            #Results_df = Results_df.append(temp_df)
    Results_df.to_csv('test_analysis.csv',index=False, encoding = 'utf-8')
def split_sentence_based_on_rules(sent):


    if re.search(r' \.+ ', sent):
        sentences = re.split(r' \.+ ', sent)
    elif re.search(r'@ ---- @', sent):
        sentences = re.split(r'@ ---- @', sent)
    elif re.search(r'\.\w+\:', sent):
        sent = re.sub(r'\.(\w+)\:', r'. \1:', sent)
        sentences = sent_tokenize(sent)
    elif re.search(r'\, as well as', sent):
        sent = sent.replace(', as well as', '. As well as')
        sentences = sent_tokenize(sent)
    elif re.search(r'[a-z\.]+[A-Z][a-z]+:', sent):
        k = re.findall(r' [a-z\.]+([A-Z][a-z]+:)', sent)
        p = chr(ord(max(sent)) + 1)
        sentences = sent.replace(k[0], p + k[0]).split(p)
    elif re.search(r'\; ', sent):
        sent = re.sub(r'\; ', r'. ', sent)
        sentences = sent_tokenize(sent)
    elif re.search(r', and, ', sent):
        sent = sent.replace(', and, ', '. And, ')
        sentences = sent_tokenize(sent)
    elif re.search(r'president\: Wechsler', sent):
        sent = sent.replace(': ', '. ')
        sentences = sent_tokenize(sent)
    elif re.search(r'\, ', sent):
        sentences = re.split(r'\, ', sent)
    else:
        sentences = [sent[:350], sent[350:]]
        print("Using greedy sentence tokenization")

    text_len = [len(sentence) for sentence in sentences]
    return sentences
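A quick check of the first rule above (the input string is made up for illustration): a run of spaced dots is treated as a sentence boundary.

parts = split_sentence_based_on_rules("First clause ... Second clause")
print(parts)   # -> ['First clause', 'Second clause']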
 def post(self):
     args = parser.parse_args()
     text = {'text': args['text']}
     print text
     print sent_tokenize(text['text'])
     print word_tokenize(text['text'])
     return text['text']
Example #8
def inputfactx(rev, include_vpr):
    this_business = find_business(rev.bizid)
    this_user = find_user(rev.uid)
    result = [ this_business.stars ]
    if include_vpr:
        result += [ this_user.get_vpr() ]
    result += [
        this_user.reviewCount,
        len(rev.text),
        rev.stars,
        rev.get_days() ]
    if len(rev.text) == 0:
        result += [ 0, 0, 0, 0, 0 ]
    else:
        excount = 0
        for sent in sent_tokenize(rev.text):
            ss = sent.strip()
            if ss.endswith('!'):
                excount += 1
        result += [ excount,
        np.mean([len(sent) for sent in sent_tokenize(rev.text)]),
        len(sent_tokenize(rev.text)),
        len(re.findall('\n\n', rev.text)) + 1,
        len(rev.text.splitlines()[0]) ]
    result += [ this_business.longitude, this_business.latitude ]
    return result
Example #9
def tokenize_sentences(filename):
	file_dir = docs_dir + str(filename)
	f = open(file_dir, 'r')

	root = ET.parse(f).getroot()
	tags = root.getiterator('str')

	# read the relevant tags
	title_string = ''
	desc_string = ''
	for tag in tags:
		if tag.get('name')  == 'Title' :
			title_string = filter(lambda x: x in string.printable, tag.text.lower().strip())

		elif tag.get('name') == 'Abstract':
			desc_string = filter(lambda x: x in string.printable, tag.text.lower().strip().replace('relevant documents will describe', ''))

	f.close()

	sentences = sent_tokenize(title_string)
	title_words = []
	for s in sentences:
		title_words = title_words + word_tokenize(s)

	sentences = sent_tokenize(desc_string)
	desc_words = []
	for s in sentences:
		desc_words = desc_words + word_tokenize(s)

	
	return (title_words, desc_words)
    def split_reddit_reviews(self,reviews):
        columns = ['Text','Score', 'True']
        #Calculate total number of sentences to fill up the data frame
        count=0
        for index,each_review in reviews.iterrows():

            split_sentences=sent_tokenize(each_review['Text'])
            count+=len(split_sentences)
        print "total number of sentences {}".format(count)

        df = pd.DataFrame(index=range(0,count), columns=columns)
        Text,Score,True_labels=[],[],[]   # 'True' is a reserved name, so use True_labels for that column
        for index,each_review in reviews.iterrows():
            split_sentences=sent_tokenize(each_review['Text'])
            actual_tag=each_review['True']
            score_tag=each_review['Score']
            for each_split_sentence in split_sentences:
                Text.append(each_split_sentence)
                Score.append(score_tag)        # each sentence keeps the review's score
                True_labels.append(actual_tag) # and its true label
        print "Count ={} Text.length {}".format(count,len(Text))
        df['Text']=Text
        df['Score']=Score
        df['True']=True_labels
        df.to_csv('../data/reddit_reviews.csv')
def tokenize(text, grams=1):
  wordStems = lambda s: map(stem, word_tokenize(s))
  sentTokens = lambda tok, s: tok + wordStems(s)

  if grams == 1:
    return list(reduce(sentTokens, sent_tokenize(text), [ ]))
  else:
    return list(ngrams(reduce(sentTokens, sent_tokenize(text), [ ]), grams))
def main(param = 0):
    ''' 
    0 for no stem
    1 for porter
    2 for lancaster
    '''
    both_pos_index = {}
    tit_pos_index = {}
    abs_pos_index = {}


    if param == 0:
        path = './NoStemmer/'
    elif param == 1:
        path = './Porter/'
    elif param == 2:
        path = './Lancaster/'

    for i in range(1,1001):
            
        '''open xml file and get abstract and title'''
        try: 
            filename = "./data/%d.xml" %i
            data = open(filename)
        except:
            print "can't open file %s" %filename
            return 0

        docid = filename.split('/')[-1].split('.')[-2]
        
        tree = etree.fromstring(data.read())
    
        title = tree.find('Title').text
        abstract =  tree.find('Abstract').text
    
    
        #####################################################
        # Step2 tokenize and make position index dictionary #
        #####################################################
        '''sentence tokenize'''
        titles = []  # default so 'both' below is defined even when a document has no title
        if title is not None:
            title = title.replace('[','',1).replace(']','',1)
            titles = [s.replace('&amp;', '') for s in sent_tokenize(title)]
            tit_pos_index = position_index(tit_pos_index, titles, docid, param)

        if abstract is not None:
            abstracts = [s.replace('&amp;', '&') for s in sent_tokenize(abstract)] 
            both = titles + abstracts
        else:
            both = titles
            
        both_pos_index = position_index(both_pos_index,both,docid, param)
    '''save position index to json'''
    
    with codecs.open( './' + path.split('/')[1] + '_both_index' + '.json', mode = 'w') as a:
        json.dump(both_pos_index, a)
Example #13
def tag_words_by_sentence(input_filename, output_path=''):
#    text = get_file_text(input_filename)
    text = 'Every day I see blue. But the sky is red. Eagles are green'
    sentences = sent_tokenize(text)
#    sentences = sent_tokenize(text)
    word_tokens = [word_tokenize(s) for s in sent_tokenize(text)]
#    word_tokens = nltk.tag.batch_pos_tag(sent_tokenize(text))
    word_pos = nltk.tag.batch_pos_tag(word_tokens)

        
    return
def sentences(a, b):
    """Return sentences in both a and b"""
    a1 = set(sent_tokenize(a))
    b1 = set(sent_tokenize(b))
    ans = []

    for line in a1:
        if line in b1:
            ans.append(line)

    return ans
Example #15
    def lexical_features(self):
        """ Lexical features
        """
        features = []
        # Add the first token from the top-1st span on stack
        if self.stackspan1 is not None:
            text = self.stackspan1.text
            texts1 = word_tokenize(text)
          #  print texts1
            sent_tokenize_list =sent_tokenize(text)
            wordb = word_tokenize(sent_tokenize_list[0] )
            worde = word_tokenize(sent_tokenize_list[-1] )
       #     print wordb[0]
            features.append(('StackSpan1','BEGIN-WORD-STACK1',wordb[0].lower()))
            features.append(('StackSpan1','BEGIN-END-STACK1',worde[-1].lower()))
            features.append(('StackSpan1','BEGIN-END-WORD-STACK1',wordb[0].lower(),worde[-1].lower()))


        if self.stackspan2 is not None:
            text = self.stackspan2.text
            texts2 = word_tokenize(text)
          #  print texts1
            sent_tokenize_list =sent_tokenize(text)
            wordb = word_tokenize(sent_tokenize_list[0] )
            worde = word_tokenize(sent_tokenize_list[-1] )
       #     print wordb[0]
            features.append(('StackSpan2','BEGIN-WORD-STACK2',wordb[0].lower()))
            features.append(('StackSpan2','BEGIN-END-STACK2',worde[-1].lower()))

        if self.queuespan1 is not None:
            text = self.queuespan1.text
            textq1 = word_tokenize(text)
          #  print texts1
            sent_tokenize_list =sent_tokenize(text)
            wordb = word_tokenize(sent_tokenize_list[0] )
            worde = word_tokenize(sent_tokenize_list[-1] )
       #     print wordb[0]
            features.append(('QueueSpan1','BEGIN-WORD-QUEUE1',wordb[0].lower()))
            features.append(('QueueSpan1','BEGIN-END-QUEUE',worde[-1].lower()))
            features.append(('QueueSpan1','BEGIN-END-WORD-QUEUE1',wordb[0].lower(),worde[-1].lower()))


        if self.stackspan2 is not None and self.stackspan1 is not None:
             features.append(('StackSpan1','LENGTH-STACK1-STACK2',len(texts1),len(texts2)))
        if self.queuespan1 is not None and self.stackspan1 is not None :

            features.append(('StackSpan1','LENGTH-STACK1-QUEUE1',len(texts1),len(textq1)))
       #     features.append(('StackSpan1','POS-START-STACK1-QUEUE1',begins1,beginq1))

        for feat in features:
            yield feat
Example #16
def parse(body):
    
    contents = []
    if isinstance(body, basestring):
        contents.append(body)
    else:
        contents = body

    sentences = []
    for content in contents:
        sentences.extend([sentence for sentence in sent_tokenize(content) if not str_helper.hasHTMLTag(sentence)])
        
    stop = stopword.get_stopwords()
    tokens = {}

    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word not in stop and not str_helper.hasNumbers(word) and not str_helper.hasPunctuation(word):
                word = stem.stemming(word)
                tokens.setdefault(word, 0)
                tokens[word] += 1

    wp = pos_tag(tokens.keys())
    words = [row[0] for row in wp]
    tags = [row[1] for row in wp]

    return words, tags
Example #17
def line_to_sentences(line):
    raw_sentences = sent_tokenize(line.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(w2v_normalize(raw_sentence))
    return sentences
Example #18
 def __init__(self, content, remove_punct=True):
     self._tokcont = [word_tokenize(s) for s in sent_tokenize(content)]
     if remove_punct:
         self._tokcont = [[w for w in s if w not in punctuation]
                          for s in self._tokcont[:]]
     # Remove zero-length sentence
     self._tokcont = [s for s in self._tokcont[:] if len(s) > 0]
def markovize(word1, word2, word3, fileid, char_limit=None):   
    
    with open(fileid, encoding='utf-8') as f:
        text = f.read()
    
    sentences = sent_tokenize(text)
    sent_tokens = defaultdict(list)
    for sentence in sentences:
        tokens = re.findall(r"[\w']+|[.,?!:;]", sentence)
        nwise_ = nwise(tokens, n=4)
        if nwise_:
            for token1, token2, token3, token4 in nwise_:
                sent_tokens[token1, token2, token3].append(token4)
    
    too_long = True
    
    while too_long:
        sentence = [word1, word2, word3]
    
        utterance = build_sentence(sentence, sent_tokens)
        len_utterance = len(utterance)
         
        if char_limit != None and len_utterance > char_limit:
            too_long = True
        else:
            too_long = False
            
    return utterance
def split_sentence_from_document(document):
    max_counts = 0
    for sent in tokenize.sent_tokenize(document):
        max_counts = max(max_counts, len(tokenize.wordpunct_tokenize(sent)))
    # if max_counts>4000:
    #     print(document)
    return max_counts
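A small usage sketch (the document string is made up for illustration, and the NLTK `tokenize` import the function relies on is assumed): it reports the wordpunct-token count of the longest sentence.

doc = "Short one. A somewhat longer sentence here."
print(split_sentence_from_document(doc))   # -> 6, for ['A', 'somewhat', 'longer', 'sentence', 'here', '.']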
Example #21
	def map(self, story):
		
		result = []
		print "parsing: %s" % story
		
		# load the grammar file
		# note, the atis grammar will only work with the atis sample sentences 
		atis_grammar = nltk.data.load('file:/roger/nltk_data/grammars/large_grammars/atis.cfg')
		
		# create a new parser with the grammar
		parser = nltk.ChartParser(atis_grammar)

		# split the story into sentence tokens
		sentence_tokens = sent_tokenize(story)
		
		for sentence_token in sentence_tokens:
			
			# split each sentence in to word tokens
			word_tokens = word_tokenize(sentence_token)
			
			# map each possible sentence structure
			for tree in parser.parse(word_tokens):
				print tree
				result.append(tree)

		return result
	def processKeywordSearch(self):
		searchString = self._args[0]
		while True:
			article = self._taskQueue.get()
			if article == END_OF_QUEUE:
				break
			else:
				articlePathPartList = article['filePath'].split('/')
				articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
				articleCompany = self._db.getCompanyByCode(articleCompanyCode)
				articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
				articleSentenceList = []

				#here it is OK to combine the regexes with '|' because sentences are short, so it will not hurt performance much.
				#But for the DB search, use the iterative approach.
				pattern = getPatternByKeywordSearchString(searchString)

				#search on the sentence level first; if nothing is found, fall back to the paragraph level.
				for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
					sentenceList = sent_tokenize(paragraph)
					for sentence in sentenceList:
						if re.search(pattern, sentence) is not None:
							articleSentenceList.append(sentence.encode('utf-8').strip())
				if not articleSentenceList:
					#search on paragraph level
					for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
						if re.search(pattern, paragraph) is not None:
							articleSentenceList.append(paragraph.encode('utf-8').strip())
				lineList = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], article['date'], article['sourceName'].strip(), article['byline'].strip(), article['headline'].strip(), '\t'.join(articleSentenceList)]
				self._resultQueue.put(lineList)
def get_stemmed_separate(indeed_reviews_db, glassdoor_reviews_db):
    separate = get_separate_reviews(indeed_reviews_db, glassdoor_reviews_db)
    stemmer = PorterStemmer()
    stemmed_reviews = []
    for review in separate:
        stemmed_reviews.append(' '.join([stemmer.stem(word) for sent in sent_tokenize(review) for word in word_tokenize(sent.lower())]))
    return stemmed_reviews
Example #24
    def __call__(self):
        '''tokenize sentences, lower cases, replace digits'''        
        text = file(self.inputFile).read().lower()
        text = filter(lambda x: x in printable, text) ## remove non-ascii characters
        sent_tokenize_list = sent_tokenize(text.strip().lower(), "english") ## tokenize documents into sentences, lower case

        for sent_idx in xrange(len(sent_tokenize_list)):
            updated_sent = [] ## a modified sentence
            sent_tokenize_list[sent_idx] = sent_tokenize_list[sent_idx].translate(replace_punctuation) ## remove all punctuation
            sent_tokenize_list[sent_idx] = TreebankWordTokenizer().tokenize(sent_tokenize_list[sent_idx]) ## sent_tokenize_list[sent_idx] is a list of unigrams now

            term_idx = 0
            sentLen = len(sent_tokenize_list[sent_idx])
            while term_idx<sentLen:
                flag = 1
                curr_term = sent_tokenize_list[sent_idx][term_idx]
                if mesh_phrase_idx.get(curr_term):
                    maxPhraseLen = mesh_phrase_idx.get(curr_term) ## the maximum length of phrase starting with the current term
                    for n in xrange(maxPhraseLen,1,-1): ## iterate from n to 2
                        curr_n_gram = " ".join(sent_tokenize_list[sent_idx][term_idx:min(term_idx+n, sentLen)])
                        if mesh_phrase_dict.get(curr_n_gram):
                            updated_sent.append(mesh_phrase_dict.get(curr_n_gram))
                            term_idx+=n # move the pointer
                            flag = 0
                            break
                    if flag:
                        updated_sent.append(curr_term)
                        term_idx+=1
                else:
                    updated_sent.append(curr_term)
                    term_idx+=1
            sent_tokenize_list[sent_idx] = re.sub(r"\b\d+\b", " ", " ".join(updated_sent))## replace isolated digits

        self.__save__(sent_tokenize_list)
    def tokenize(self, document):
        """
        Break text into sentences and each sentence into a list of single words
        Ignore any token that falls into the stopwords set.
        """
        # use sentence tokenizer sent_tokenize from nltk package
        sentences = sent_tokenize(utils.to_unicode(document.lower()))

        # create stemmer of class SnowballStemmer
        stemmer = SnowballStemmer("english")

        for sentence in sentences:
            words = [word
                   for word in utils.tokenize(
                    self.cleanse_text(sentence)
                   )]

            if self.remove_stopwords:
                words = [ 
                         word for word in words 
                         if word not in self.en_stopwords
                        ]

            if self.stemming:
                words = [stemmer.stem(t) for t in words]

            yield words
def stat_reviews(reviews):
    """

    :type reviews: list[Review]
    :param reviews:
    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

    stats = np.zeros(5)
    num_reviews = len(reviews)
    for review in reviews:
        text = review.text
        num_sentences = len(tokenize.sent_tokenize(text))
        num_words = len(tokenizer.tokenize(text.lower()))
        tagged_words = review.tagged_words
        tags_count = Counter(tag for word, tag in tagged_words)
        num_past_verbs = float(tags_count['VBD'])
        num_verbs = tags_count['VB'] + tags_count['VBD'] + tags_count['VBG'] +\
            tags_count['VBN'] + tags_count['VBP'] + tags_count['VBZ']
        ratio = (num_past_verbs + 1) / (num_verbs + 1)

        stats[0] += num_sentences
        stats[1] += num_words
        stats[2] += num_past_verbs
        stats[3] += num_verbs
        stats[4] += ratio

    for index in range(len(stats)):
        stats[index] /= num_reviews

    print('Average sentences:', stats[0])
    print('Average words:', stats[1])
    print('Average past verbs:', stats[2])
    print('Average verbs:', stats[3])
    print('Average past verbs ratio:', stats[4])
Example #27
    def extract_post_features(self, post):
        #Assume post consists of one string
        post_features = [0, {}, {}]
        sent_tokenized = sent_tokenize(post)
        sent_word_tokenized = [word_tokenize(s) for s in sent_tokenized]
        if self.ngram_features:
            ngrams = len(self.ngram_features.keys()[0])
        for sentence in sent_word_tokenized:
            if self.ngram_features:
                ngramsentence = find_ngrams(sentence, ngrams)
                for ngram in ngramsentence:
                    if ngram in self.ngram_features.keys():
                        if ngram in post_features[2]:
                            post_features[2][ngram] += 1
                        else:
                            post_features[2][ngram] = 1
            for token in sentence:
                if token in self.word_features.keys():
                    if token in post_features[1]:
                        post_features[1][token] += 1
                    else:
                        post_features[1][token] = 1
        modified_list = [(0, 0)]
        for word, number in self.word_features.iteritems():
            if word in post_features[1].keys():
                modified_list.append((number, post_features[1][word]))
        for ngram, number in self.ngram_features.iteritems():
            if ngram in post_features[2].keys():
                modified_list.append((number, post_features[2][ngram]))

       # print modified_list
        return [x[0] for x in modified_list]
    def make_sentences(self):

        """
        Makes sentences from raw documents.
        Each sentence is wrapped up in a sentence class
        :return: None
        """

        # Create parameters for NER and dependency parsing
        # and pass them to the sentence object

        # set config file
        config = CP.RawConfigParser()
        config.read('config.py')
         # Server for dependency parsing
        server = ServerProxy(JsonRpc20(),TransportTcpIp(addr=("127.0.0.1", 8080), timeout=200.0))

        # Parameters for named entity recognition

        # get the classifier and tagger location from config file
        tagger = config.get('NER','tagger') # gets the path of the stanford tagger
        classifier = config.get('NER','classifier') # gets the path of the stanford classifier
        st = StanfordNERTagger(classifier,tagger)


        if self.document == None:
            return

        sent = sent_tokenize(self.document) # contains raw sentences
        for i in range(len(sent)):
            s = Sentence(sent[i],i, server, st) # We also pass the server object and nertagger
            self.sentences.append(s)
def missingCorpus(corpusdir):
    try:
        os.makedirs(corpusdir)
    except OSError:
        if not os.path.isdir(corpusdir):
            raise
    
    try:
        os.makedirs(corpusdir+'/ratings')
    except OSError:
        if not os.path.isdir(corpusdir+'/ratings'):
            raise
    hotel = json.load(open(data_path+file))
    stopset = hotelNameAddress(hotel)
    stopgroup = ""
    for e in stopset:
        stopgroup += e+" "
    stopgroup = stopgroup[0:-1]
    with open(corpusdir+'/stopset.txt', 'w') as fout:
        fout.write(stopgroup)
    revNum = 0
    for review in hotel.get('Reviews'):
        revNum += 1
        contentOut = ""
        overall = review.get('Ratings').get('Overall')
        content = pos_tag_sents([word_tokenize(sentence) for sentence in sent_tokenize(review.get('Content'))])
        with open (corpusdir+'/ratings/OverallRating'+str(revNum)+'.txt', 'w') as fout:
            fout.write(overall)
        with codecs.open(corpusdir+'/Review'+str(revNum)+'.txt', 'w', encoding = "utf-8") as fout:
            for sentence in content:
                for word, pos in sentence:
                    contentOut += word+"/"+pos+" "
                contentOut += '\n'
            fout.write(contentOut)
Example #30
	def training_ner(self, paragraph, classification):
		sentence = sent_tokenize(paragraph)
		#print paragraph
		
		#result = []
		train = []
		sentence_ne = ""
		# 1. Split the paragraph into sentences
		for index, data in enumerate(sentence):	
			tokenize = word_tokenize(data)
			div_sentence = []
			for word in tokenize:
				#check_kota = len(list(self.db.cities.find({"kota":re.compile("^"+word+"$", re.IGNORECASE)})))>=1
				check_kota = (self.db.location.find({"$text": {"$search": word.lower()}}).count())>=1
				# print "word : %s, check : %s"%(word,check_kota) 
				if not check_kota:
					#if the word is not a city, reduce it to its base (stemmed) form
					sent_stem = self.stemmer.stem(word)
					word = sent_stem
				div_sentence.append(word)
			train.append(" ".join(div_sentence))
			#parameter note: self.div_sentence_ner(stemmed_sentence, original_sentence, classification_type)
			sentence_ne = self.div_sentence_ner("".join(train), " ".join(tokenize), classification)
			#result.append(sentence_ne)
			#reset the train list so it is not carried into the next NER training pass
			train = []

		return sentence_ne
Example #31
def load_dataset(args,
                 dataset_name,
                 sup_source,
                 num_seed_doc=10,
                 common_words=10000,
                 truncate_doc_len=None,
                 truncate_sent_len=None,
                 with_eval=True):
    data_path = './' + dataset_name
    data, y, class_tree = read_file(dataset_name, with_eval=with_eval)

    np.random.seed(1234)

    data = preprocess_doc(data)
    data = [s.split(" ") for s in data]
    trun_data = [s[:truncate_doc_len] for s in data]
    tmp_list = [len(doc) for doc in data]
    len_max = max(tmp_list)
    len_avg = np.average(tmp_list)
    len_std = np.std(tmp_list)

    print("\n### Dataset statistics - Documents: ###")
    print(f'Document max length: {len_max} (words)')
    print(f'Document average length: {len_avg} (words)')
    print(f'Document length std: {len_std} (words)')

    if truncate_doc_len is None:
        truncate_doc_len = min(int(len_avg + 3 * len_std), len_max)
    print(f"Defined maximum document length: {truncate_doc_len} (words)")
    print(
        f'Fraction of truncated documents: {sum(tmp > truncate_doc_len for tmp in tmp_list) / len(tmp_list)}'
    )

    sequences_padded = pad_docs(trun_data, pad_len=truncate_doc_len)
    word_counts, vocabulary, vocabulary_inv, trim_vocabulary = build_vocab(
        sequences_padded, common_words)
    print(f"Vocabulary Size: {len(vocabulary_inv):d}")
    x = build_input_data(sequences_padded, vocabulary)
    x = np.array(x)

    assign_data_to_nodes(args, x, y, class_tree)

    # Prepare sentences for training LSTM language model
    trun_data = [" ".join(doc) for doc in trun_data]
    flat_data = [tokenize.sent_tokenize(doc) for doc in trun_data]
    flat_data = [sent for doc in flat_data for sent in doc]
    flat_data = [sent for sent in flat_data if len(sent.split(" ")) > 5]
    tmp_list = [len(sent.split(" ")) for sent in flat_data]
    max_sent_len = max(tmp_list)
    avg_sent_len = np.average(tmp_list)
    std_sent_len = np.std(tmp_list)
    if truncate_sent_len is None:
        truncate_sent_len = min(int(avg_sent_len + 3 * std_sent_len),
                                max_sent_len)
    print("\n### Dataset statistics - Sentences: ###")
    print(f'Sentence max length: {max_sent_len} (words)')
    print(f'Sentence average length: {avg_sent_len} (words)')
    print(f"Defined maximum sentence length: {truncate_sent_len} (words)")
    print(
        f'Fraction of truncated sentences: {sum(tmp > truncate_sent_len for tmp in tmp_list) / len(tmp_list)}'
    )
    flat_data = [s.split(" ") for s in flat_data]
    sequences = build_sequence(flat_data, trim_vocabulary, truncate_sent_len)

    perm = np.random.permutation(len(x))
    if sup_source == 'keywords':
        load_keywords(data_path, class_tree)
    elif sup_source == 'docs':
        if dataset_name == 'yelp':
            class_type = 'sentiment'
            num_keywords = 5
        else:
            class_type = 'topic'
            num_keywords = 10
        extract_keywords(data_path, class_tree, class_type, vocabulary,
                         num_seed_doc, num_keywords, data, perm)
    x = x[perm]
    if y is not None:
        if type(y) == dict:
            inv_perm = {k: v for v, k in enumerate(perm)}
            perm_y = {}
            for doc_id in y:
                perm_y[inv_perm[doc_id]] = y[doc_id]
            y = perm_y
        else:
            y = y[perm]
    return x, y, sequences, class_tree, word_counts, vocabulary, vocabulary_inv, len_avg, len_std, perm
Example #32
    def sentenceTokenizing(self, sentence):

        print(sent_tokenize(sentence))
#import the modules required.
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize

#Load the corpus from a text file and tokenize it into sentences.
with open('matter.txt', 'r') as f:
    data = f.read()
Text = sent_tokenize(data)
# Total number of sentences in the data. Prints 14 for this text.
print(len(Text))


#Define the preprocessor routine for the data.
def preprocess(sentence):
    sentence = sentence.lower()
    sentence = "".join([x for x in sentence if x not in string.punctuation])
    sentence = [
        x for x in sentence.split(" ") if x not in stopwords.words('english')
    ]
    sentence = [x for x in sentence if x != '']
    return " ".join(sentence)


# Fit a bag of words estimator and transform the count matrix.
bow_vectorizer = CountVectorizer(lowercase=True, preprocessor=preprocess)
model = bow_vectorizer.fit(Text)
bag_of_words = model.transform(Text)

#Get the frequencies of the words.
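The example stops before the announced step; a minimal sketch of it might sum the count matrix column-wise and pair the totals with the vocabulary (assuming scikit-learn 1.0+ for get_feature_names_out; older releases expose get_feature_names instead).

import numpy as np

word_counts = np.asarray(bag_of_words.sum(axis=0)).ravel()   # total count per vocabulary term
vocab = bow_vectorizer.get_feature_names_out()
frequencies = sorted(zip(vocab, word_counts), key=lambda pair: pair[1], reverse=True)
print(frequencies[:10])   # the ten most frequent words after preprocessing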
Example #34
        return b


f = open("sample.txt", "r")

data = f.read()

# print(data)
d1 = data.replace("?", "? ")
d2 = d1.replace('.', '. ')
data = d2.replace('!', '! ')
data = data.replace("\n", ' ')

# print(data)

sent_tokenize_list = sent_tokenize(data)
print(sent_tokenize_list)

f = open('sentence.txt', 'w')

for i in sent_tokenize_list:
    f.write(i)
    f.write("\n")

no_of_sents = len(sent_tokenize_list)

table = data.maketrans('', '', string.punctuation)

for i in range(no_of_sents):
    sent_tokenize_list[i] = sent_tokenize_list[i].translate(table)
Example #35
def tokenize_sentence(file_str):
    """Return a List of Tokens"""
    sent_tokenize_list = sent_tokenize(file_str)
    for i in range(0, 1):
        word_token = word_tokenize(sent_tokenize_list[i])
        return word_token
with open(sys.argv[1], "rb") as csvfile:
    city = csv.reader(csvfile, delimiter=',', quotechar='"')
    counter = 0
    count2 = 0
    for row in city:
        if counter == 0:
            inner_counter = 0
            for f in row:
                fields[f.strip()] = inner_counter
                inner_counter += 1
        else:
            try:
                if len(row[fields['neighbourhood_cleansed']]) > 1:
                    n = row[fields['neighbourhood_cleansed']].strip().lower()
                    d = row[fields['description']]
                    s = sent_tokenize(d)
                    for t in s:
                        search = '%s is' % (n)
                        n_idx = t.lower().find(search)
                        if n_idx > -1:
                            tags = pos_tag(word_tokenize(t[n_idx:]))
                            for tag in tags:
                                if tag[1] == 'JJ':
                                    if n in ngh_adjs:
                                        ngh_adjs[n].append(tag[0])
                                    else:
                                        ngh_adjs[n] = [tag[0]]
            except:
                pass
        counter += 1
for line in o:
    while (nomeCien_re.search(line)):
        line = re.sub(nomeCien_re, ' (taxonomia) ', line)

    while (Universidade_re.search(line)):
        line = re.sub(Universidade_re, ' (Universidade) ', line)

    while (refe_re.search(line)):
        line = re.sub(refe_re, ' ', line)

    while (num_re.search(line)):
        line = re.sub(num_re, ' (numeros) ', line)

    result = ([
        word_tokenize(t, 'portuguese')
        for t in sent_tokenize(line, 'portuguese')
    ])

    sentencas = []
    chunks = []

    for sent in result:
        chunks = []
        for word, tag in tagger2.tag(sent):
            if (word == 'da' or word == 'das' or word == 'do' or word == 'dos'
                    or word == 'na' or word == 'nas' or word == 'no'
                    or word == 'nos'):
                tag = 'PREP'
            chunk = (word + '/' + tag)
            chunks.append(chunk)
        sentencas.append(chunks)
Example #38
import os

if __name__ == '__main__':

    initial_time = time()
    corruptedFiles = []
    sentencesCount = 0
    processedFiles = 0

    for path in os.listdir('MCE-corpus'):

        try:
            parser = etree.XMLParser(recover=True)
            tree = etree.parse('MCE-corpus/' + path, parser)
            root = tree.getroot()
            sentences = sent_tokenize(root.find('body').text)
            sentencesCount += len(sentences)
            processedFiles += 1
        except:
            corruptedFiles.append(path)

    final_time = time()

    print('The files have been processed.')
    print('Average number of sentences per comment: ' +
          str(round(sentencesCount / processedFiles, 2)))

    if len(corruptedFiles) > 0:
        print('The following files could not be processed: \n')
        for x in corruptedFiles:
            print(x)
Example #39
        # Compute the ROUGE score
        rouge = computeRouge(summary, reference)
        calculate_words(data)

        # Is this ROUGE score better than the previous one?
        if rouge > prev_rouge and new_li < 100 and all(isLavg) == True:
            prev_rouge = rouge
            my_string = "Attempt = {}\t Rouge = {}\t Word count = {}\t"
            print(my_string.format(t, rouge, new_li))


print("Start ...")
print("Reading document ...")
text = readText("training/AP880310-0257")
reference = "Senators McClure (R) and Metzenbaum (D) have sponsored bills to prevent plastic guns from slipping through airport security.  The gun, not yet manufactured, is intended for military and police use. Metzenbaum's bill would require some detectable metal content; McClure's would require more sensitive detection equipment at airports, almost certainly causing passenger delays. The NRA opposes the first federal gun ban bill in America, and warns members their guns will soon be inspected and weighed by government agents. However, on this issue they will compromise, not to ban the gun, but to increase airport security.  Handgun control advocates and law enforcement officials back Metzenbaum's proposal."
sentences = sent_tokenize(text)
max_n = len(sentences)

l_avg = len(word_tokenize(text)) / len(sentences)

# STAGE 1: Random number of sentences
num_sentence = randomNumberSentence(max_n)

print("Нужно {} количество предложении..".format(num_sentence))

vectorizer = CosumTfidfVectorizer()
vectorizer.fit(text)
vector = vectorizer.weight_matrix

print("Вычисляем rouge")
# STAGE 2: Вычисливаем rouge
Example #40
fi = open('Input5.txt', encoding="utf8")
strss = ""
for line in fi:
    strss = strss + line
#print(strss)


class my_struct():
    def __init__(self, i, j, sim):
        self.i = i
        self.j = j
        self.sim = sim


sents = sent_tokenize(strss)
#print(sents)
l = len(sents)
#print(l)

sim = []
for i in range(l):
    for j in range(l):
        if (j > i):
            cal = similarity(sents[i], sents[j], False)
            sim.append(my_struct(i, j, cal))
            #print("%d %d %.3f" %(i, j, cal))
            #print("%s\t%s\t%.3f\t%.3f" % (sents[i], sents[j], similarity(sents[i], sents[j], False), similarity(sents[i], sents[j], True)))

lsm = len(sim)
sim.sort(key=lambda x: x.sim, reverse=True)
Example #41
tic = time.time()
categories = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
Y = train[categories].values

train["comment_text"].fillna("no comment", inplace=True)
train["comment_text"] = train["comment_text"].apply(lambda x: clean_corpus(x))

test["comment_text"].fillna("no comment", inplace=True)
test["comment_text"] = test["comment_text"].apply(lambda x: clean_corpus(x))

train["sentences"] = train["comment_text"].apply(
    lambda x: tokenize.sent_tokenize(x))
test["sentences"] = test["comment_text"].apply(
    lambda x: tokenize.sent_tokenize(x))
toc = time.time()
print(toc - tic)

from keras.preprocessing.text import Tokenizer, text_to_word_sequence

raw_text = train["comment_text"]
tk = Tokenizer(num_words=max_features, lower=True)
tk.fit_on_texts(raw_text)


def sentenize(data):
    comments = data["sentences"]
    sent_matrix = np.zeros((len(comments), max_sent, max_text_len),
Example #42
def main(args, text):
    import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = utils.load_ensemble_for_inference(
        args.path.split(':'),
        task,
        model_arg_overrides=eval(args.model_overrides),
    )
    args.copy_ext_dict = getattr(_model_args, "copy_attention", False)

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Initialize generator
    generator = task.build_generator(args)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)
    if align_dict is None and args.copy_ext_dict:
        align_dict = {}

    max_positions = utils.resolve_max_positions(
        task.max_positions(), *[model.max_positions() for model in models])

    if args.buffer_size > 1:
        print('| Sentence buffer size:', args.buffer_size)
    #print('| Type the input sentence and press return:')
    start_id = 0
    src_strs = []
    results = []
    inputs = tokenize.sent_tokenize(text)
    for batch in make_batches(inputs, args, task, max_positions):
        src_tokens = batch.src_tokens
        src_lengths = batch.src_lengths
        src_strs.extend(batch.src_strs)
        if use_cuda:
            src_tokens = src_tokens.cuda()
            src_lengths = src_lengths.cuda()

        sample = {
            'net_input': {
                'src_tokens': src_tokens,
                'src_lengths': src_lengths,
            },
        }
        translations = task.inference_step(generator, models, sample)
        for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)):
            src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad())
            results.append((start_id + id, src_tokens_i, hypos))

    # sort output to match input order
    res = ''
    for id, src_tokens, hypos in sorted(results, key=lambda x: x[0]):
        if src_dict is not None:
            src_str = src_dict.string(src_tokens, args.remove_bpe)
            print('S-{}\t{}'.format(id, src_str))

        hypo = hypos[0]
        _, hypo_str, _ = utils.post_process_prediction(
            hypo_tokens=hypo['tokens'].int().cpu(),
            src_str=src_strs[id],
            alignment=hypo['alignment'].int().cpu()
            if hypo['alignment'] is not None else None,
            align_dict=align_dict,
            tgt_dict=tgt_dict,
            remove_bpe=args.remove_bpe,
        )
        if id == 0:
            res = hypo_str
        else:
            res = res + ' ' + hypo_str

    return res
     no_of_sentences = len(sentences)
     pos_tagged_sentence = pos_tagging(sentence)
     for word in pos_tagged_sentence:
          if word.lower() not in Stopwords and word not in Stopwords and len(word)>1: 
                word = word.lower()
                word = wordlemmatizer.lemmatize(word)
                sentence_score = sentence_score + word_tfidf(dict_freq,word,sentences,sentence)
     return sentence_score


# file = r'C:\Users\Dell\Desktop\FYPr-master\summarized.txt'
# file = open(file , 'r', encoding="utf8")
# text = file.read()
text = sys.argv[1]
input_user = int(sys.argv[2])
tokenized_sentence = sent_tokenize(text)
text = remove_special_characters(str(text))
text = re.sub(r'\d+', '', text)
tokenized_words_with_stopwords = word_tokenize(text)
tokenized_words = [word for word in tokenized_words_with_stopwords if word not in Stopwords]
tokenized_words = [word for word in tokenized_words if len(word) > 1]
tokenized_words = [word.lower() for word in tokenized_words]
tokenized_words = lemmatize_words(tokenized_words)
word_freq = freq(tokenized_words)
#input_user = int(input('Percentage of information to retain(in percent):'))
no_of_sentences = int((input_user * len(tokenized_sentence))/100)
#print(no_of_sentences)
c = 1
sentence_with_importance = {}
for sent in tokenized_sentence:
    sentenceimp = sentence_importance(sent,word_freq,tokenized_sentence)
def getSentences(text):
    return sent_tokenize(text)
Example #45
    'CZK', 'DJF', 'DKK', 'DOP', 'DZD', 'EGP', 'ERN', 'ETB', 'EUR', 'FJD',
    'FKP', 'GBP', 'GEL', 'GHS', 'GIP', 'GMD', 'GNF', 'GTQ', 'GYD', 'HKD',
    'HNL', 'HRK', 'HTG', 'HUF', 'IDR', 'ILS', 'INR', 'IQD', 'IRR', 'ISK',
    'JMD', 'JOD', 'JPY', 'KES', 'KGS', 'KHR', 'KMF', 'KPW', 'KRW', 'KWD',
    'KYD', 'KZT', 'LAK', 'LBP', 'LKR', 'LRD', 'LSL', 'LYD', 'MAD', 'MDL',
    'MGA', 'MKD', 'MMK', 'MNT', 'MOP', 'MRU', 'MUR', 'MVR', 'MWK', 'MXN',
    'MXV', 'MYR', 'MZN', 'NAD', 'NGN', 'NIO', 'NOK', 'NPR', 'NZD', 'OMR',
    'PAB', 'PEN', 'PGK', 'PHP', 'PKR', 'PLN', 'PYG', 'QAR', 'RON', 'RSD',
    'RUB', 'RWF', 'SAR', 'SBD', 'SCR', 'SDG', 'SEK', 'SGD', 'SHP', 'SLL',
    'SOS', 'SRD', 'SSP', 'STN', 'SVC', 'SYP', 'SZL', 'THB', 'TJS', 'TMT',
    'TND', 'TOP', 'TRY', 'TTD', 'TWD', 'TZS', 'UAH', 'UGX', 'USD', 'USN',
    'UYI', 'UYU', 'UYW', 'UZS', 'VES', 'VND', 'VUV', 'WST', 'XAF', 'XAG',
    'XAU', 'XBA', 'XBB', 'XBC', 'XBD', 'XCD', 'XDR', 'XOF', 'XPD', 'XPF',
    'XPT', 'XSU', 'XTS', 'XUA', 'YER', 'ZAR', 'ZMW', 'ZWL'
]
currency_dict = dict({' ': currency_codes})
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_dict(currency_dict)
cleaned_text = keyword_processor.replace_keywords(cleaned_text)
# remove whitespace
cleaned_text = ' '.join(cleaned_text.split())
print('step 6')

sentences = sent_tokenize(cleaned_text)
sentences = list(filter(None, sentences))

output_file_name = input("Please enter the output file name : ")
with open(output_file_name, "w") as f:
    for sentence in sentences:
        f.write(sentence + '\n')
Example #46
#------------------------------- TOKENIZATION--------------------------------------
from nltk.tokenize import sent_tokenize, word_tokenize
 
text = "Twitter was created in March 2006 by Jack Dorsey,"+ \
       "Noah Glass, Biz Stone, and Evan Williams and launched " +\
       "in July of that year. The service rapidly gained worldwide "+\
       "popularity. In 2012, more than 100 million users posted 340 "+\
       "million tweets a day, and the service handled an average of "+\
       "1.6 billion search queries per day."
 

print "------------------------------- TOKENIZATION--------------------------------------"
print "\nSENTENCE TOKENIZATION: ",sent_tokenize(text)
print "\nWORD TOKENIZATION: ",word_tokenize(text)

#-------------------------STOP WORD REMOVAL--------------------------------------
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
stop_words = set(stopwords.words('english'))
stop_words.add('.')
stop_words.add(',')
 
word_tokens = word_tokenize(text)
 
filtered_sentence = [w for w in word_tokens if not w in stop_words]
 
filtered_sentence = []
 
for w in word_tokens:
    if w not in stop_words:
import string
import nltk
from pprint import pprint
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.chunk import ne_chunk
from nltk.draw.tree import TreeView

text = '''Another ex-Golden Stater, Paul Stankowski from Oxnard, is contending for a berth on the U.S. Ryder Cup team after winning his first PGA Tour event last year and staying within three strokes of the lead through three rounds of last month's U.S. Open. H.J. Heinz Company said it completed the sale of its Ore-Ida frozen-food business catering to the service industry to McCain Foods Ltd. for about $500 million. It's the first group action of its kind in Britain.'''
print(text)

# Sentence splitting
nltk_sentence_splitted = sent_tokenize(text)
for index, sentence in enumerate(nltk_sentence_splitted, 1):
    print(f'SENTENCE {index}: {sentence}')

# Tokenization
example_sentence = "I'll refuse to permit you to obtain the refuse permit."
tokenized = nltk.word_tokenize(example_sentence)
print(tokenized)

# Part of speech tagging
pos_tagged = nltk.pos_tag(tokenized)
print(pos_tagged)

# Remove stop words
english_stopwords = stopwords.words('english')
set_english_stopwords = set(english_stopwords) # sets are much faster for membership checks
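The snippet breaks off here; a minimal sketch of how that set would typically be used, filtering the tokens produced above (an assumption about the intended next step):

filtered_tokens = [token for token in tokenized
                   if token.lower() not in set_english_stopwords]
print(filtered_tokens)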
	return list(set(all_pairs))

#
#testing
#
corpus = "a b. a b c. a b c d. b c. b c d. c d. d c. d c b. d c b a. a b c d e. e d. e d c."
unique_words = get_unique_words(corpus)
n = len(unique_words)
w2id, id2w = w2id_id2w_maps(unique_words)

#create empty cooccurence matrix
A = np.zeros([n,n],np.float32)

#compute cooccurence matrix
sentences = sent_tokenize(corpus)
for s in sentences:
	s = process_text(s)
	max_distance = len(s) + 1
	s = [w2id[w] for w in s]	#convert words to ids

	for d in range(2,max_distance):
		pairs = cooccurence_pair_of_distance(s, d)

		#update cooccurence matrix for each pair
		for p in pairs:
			A[p[0],p[1]] += ngram_inc_amt(d)
			A[p[1],p[0]] += ngram_inc_amt(d)

#finished cooccurence matrix A
print (w2id)
Example #49
def get_documents_bbc(tree):
    documents = []
    prev_hour = ['00:00']
    articles = tree.xpath(".//article")
    for article in articles:
        # source title
        source_title = article.xpath(
            './/header[@class="lx-stream-post__header gs-o-media"]')
        if len(source_title) == 1:
            source_title = text_normalization(
                BeautifulSoup(html.tostring(source_title[0]),
                              "html.parser").get_text())
        # hour
        hour = re.findall(
            r"[0-9]{2}:[0-9]{2}",
            html.tostring(article))  # get the hour linked to the article
        if not hour:
            hour = prev_hour
        # lines
        lines = article.xpath('.//div[@class="lx-stream-post-body"]//p')

        # text
        text_lines = []
        if len(lines) >= 1:
            for line in lines:
                text_lines.append(
                    BeautifulSoup(html.tostring(line),
                                  "html.parser").get_text())
        # author
        author = article.xpath(
            './/div[@class="lx-stream-post__contributor gs-o-media"]')
        if len(author) == 1:
            author = author[0].xpath(
                ".//p/text()"
            )  # get the description of the author of the article
        else:
            author = ''
        # extract the links form the block

        lines = article.xpath('.//div[@class="lx-stream-post-body"]')
        if len(lines) == 1:
            cont = html.tostring(lines[0])
            links = set(
                re.findall(r'https?://[a-z\.]+/[a-z\-_0-9/]+\.[a-z]{2,4}',
                           cont))
            links = links.union(
                re.findall(r'https?://[A-Za-z\.]+/[A-Za-z\-_0-9/]+', cont))
        else:
            cont = html.tostring(article)
            links = set(
                re.findall(r'https?://[a-z\.]+/[a-z\-_0-9/]+\.[a-z]{2,4}',
                           cont))
            links.union(
                re.findall(r'https?://[A-Za-z\.]+/[A-Za-z\-_0-9/]+', cont))

        try:
            for link in links:
                # print link
                if "https://twitter.com/" in link and "status" in link:
                    # we extract the content from the twitter status
                    twi_page = get(link).text
                    twi_tree = html.fromstring(twi_page)
                    tweets = twi_tree.xpath(
                        '//p[contains(@class, "tweet-text")]')
                    if len(tweets) >= 1:
                        for tweet in tweets:
                            twi_text = BeautifulSoup(html.tostring(tweet),
                                                     "html.parser").get_text()
                            text_lines.append(twi_text)
        except:
            pass

        # retrieving of links in the text
        block_id = article.get("id")

        block_text = [
            sent_tokenize(text_normalization(line.strip()))
            for line in text_lines if line.strip() != u""
        ]
        block_text = list(itertools.chain.from_iterable(block_text))

        if len(block_text) == 1:
            if block_text[0] == '':
                block_text = [source_title]
        if len(block_text) == 0:
            block_text = [source_title]

        d_block = {
            "time": hour[0],
            "text": block_text,
            "block_id": block_id,
            "author": author,
            "title": source_title
        }
        prev_hour = hour
        documents.append(d_block)
    return documents
Example #50
def geigerize():
    """
    Selects highlights from submitted comments
    using the specified strategy.
    """
    data = request.get_json()
    strat = data['strategy']

    # Wrangle posted comments into the minimal format needed for processing.
    comments = [
        Comment({
            'commentID': c['id'],
            'commentBody': c['body_html'],
            'recommendations': c['score'],
            'userDisplayName': c['author'],
            'createDate': 0,
            'replies': []  # ignoring replies for now
        }) for c in data['comments']
    ]

    results = []
    if config.sentences:
        # Try out sentences as the object
        sentences = [[Sentence(sent, c) for sent in sent_tokenize(c.body)]
                     for c in comments]
        sentences = [s for sents in sentences for s in sents]

        # Run the specified strategy.
        raw_results = getattr(geiger, strat)(sentences)

        # Format results into something jsonify-able.
        for r in raw_results:
            s = r[1]
            results.append({
                'sentence': r[0],
                'comment': {
                    'id': s.comment.id,
                    'body': s.body,
                    'author': s.comment.author
                },
                'support': int(r[2]),
                'cohort': [c.body for c in r[3]]
            })

    else:
        raw_results = getattr(geiger, strat)(comments)

        # Format results into something jsonify-able.
        for r in raw_results:
            comment = r[1]
            results.append({
                'sentence': r[0],
                'comment': {
                    'id': comment.id,
                    'body': comment.body,
                    'author': comment.author
                },
                'support': int(r[2]),
                'cohort': [c.body for c in r[3]]
            })

    return jsonify(results=results)
Example #51
from nltk import ne_chunk
from collections import Counter
from nltk.util import ngrams
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
#nltk.download('punkt')
#nltk.download('word_tokenize')
#nltk.download('wordpunct_tokenize')
#nltk.download('sent_tokenize')

s = "Hi this is a python program. I am doing the in class exercise. Time given to solve is 300 minutes."
meaning = wn.synsets('program')
for a in meaning:
    print(a.definition())
print([syns.definition() for syns in meaning])
x = sent_tokenize(s)
print x
for t in x:
    print word_tokenize(t)

#lemmetization
print "lemmatization:"
lemmatizer = WordNetLemmatizer()
print lemmatizer.lemmatize('cooking')
print lemmatizer.lemmatize('cooking', pos='v')

#stemming
stemmer = PorterStemmer()
print "stemming"
print stemmer.stem('cooking')
#nltk.download('all')
Example #52
def predict(pathToModels, pageParsedContent):
    import numpy as np
    import pandas as pd
    import requests
    import matplotlib.pyplot as plt
    from bs4 import BeautifulSoup
    data = pd.read_csv(pathToModels + '/Tech/odf_scraped.csv')
    X = data.iloc[:, 0]
    y = data.iloc[:, 1]
    from sklearn.feature_extraction.text import CountVectorizer
    cv = CountVectorizer(max_features=200)
    X_vec = cv.fit_transform(X)
    from sklearn.feature_extraction.text import TfidfTransformer
    tf = TfidfTransformer()
    X_tf = tf.fit_transform(X_vec)
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression()
    lr.fit(X_tf, y)
    print('done successfully till here')
    soup = pageParsedContent
    main_title = soup.find('title')
    main_title = main_title.get_text()
    main_title = main_title.replace(",", " ")
    with open(pathToModels + '/Tech/final_testing.txt', 'w') as f:
        f.write(main_title + '\n')
    with open(pathToModels + '/Tech/final_testing.txt', 'r') as f:
        text_test = f.read()
    text_test = [text_test]
    text_test_cv = cv.transform(text_test)
    text_test_tf = tf.transform(text_test_cv)
    u = lr.predict(text_test_tf)
    print(u)

    if (u == 1):
        return u

    #output = round(prediction[0], 2)
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    sw = set(stopwords.words('english'))
    num = []
    for i in range(0, 3000):
        i = str(i)
        num.append(i)
    mx = [
        '(', ')', ',', '.', '?', '#', '@', '!', '[', ']', '<', '>', '/', ' ',
        '|', "''", '...', ':'
    ]
    soup = pageParsedContent
    body_content = soup.body
    for script in body_content(["script", "style"]):
        script.decompose()
    body_content_text = body_content.get_text()
    body_content_text = body_content_text.replace('\n', ' ')
    body_content_text = body_content_text.replace('\t', ' ')
    body_content_text = body_content_text.replace("'", ' ')
    with open(pathToModels + '/Tech/final_scraped_content_test.txt', 'w') as f:
        f.write(body_content_text)
    with open(pathToModels + '/Tech/final_scraped_content_test.txt', 'r') as f:
        new_text = f.read()
    from nltk.tokenize import sent_tokenize
    sent = sent_tokenize(new_text)
    data = []
    for i in sent:
        words = word_tokenize(i)
        for w in words:
            if w not in sw and w not in mx and w not in num:
                data.append(w)
    from collections import Counter
    count = Counter(data)
    count = sorted(count.items(), key=lambda x: x[1], reverse=True)
    tech = [
        'Samsung', 'Xiaomi', 'Lenovo', 'Vivo', 'Oppo', 'Apple', 'LG', 'Nokia',
        'OnePlus', 'HTC', 'Huawei', 'Amazon', 'Flipkart', 'Google', 'Mobile',
        'Smartphones', 'Smartphone', 'Laptop', 'AI', 'Robots', 'Gaming',
        'Headphone', 'PC', 'Computers', 'Apps', 'App', 'Phones', 'Phone',
        'Smart', 'Android', 'iOS', 'Snapdragon', 'Qualcom', 'Intel',
        'Startups', 'smartwatch', 'Realme', 'Redmi', 'Bluetooth',
        'Camera'
    ]
    cc = 0
    ff = 0
    for i in range(0, len(count) - 1):
        de = count[i][1]
        if (de == 1):
            ff = ff + 1
        else:
            cc = cc + de
    score = 0
    b = 0
    t = 0
    for i in range(0, len(count) - 1):
        a = count[i][0]
        b = count[i][1]
        for k in tech:
            if (a == k):
                print(a)
                b = b / cc
                t = t + b

                print(b)
                score = score + 1
    fp = count[0][1]
    xy = fp / cc
    if (b > xy and score > 7):
        return 1
    else:
        return 0
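
# --- Hedged usage sketch (not from the source) ---------------------------
# predict() expects a models directory containing 'Tech/odf_scraped.csv' plus
# a page already parsed with BeautifulSoup. The URL below is an illustrative
# placeholder and '/path/to/models' is a hypothetical location.
import requests
from bs4 import BeautifulSoup

html = requests.get('https://example.com/some-tech-article').text
page = BeautifulSoup(html, 'html.parser')
is_tech = predict('/path/to/models', page)
print('Tech page' if is_tech else 'Not a tech page')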
Exemple #53
0
# use nltk to separate the sentences
from nltk.tokenize import sent_tokenize, word_tokenize
parag = "My name is roky . i like nltk . i like python"
print(sent_tokenize(parag))
myArr = sent_tokenize(parag)
print(myArr)
print(myArr[2])
print(word_tokenize(parag))

# <a id="551"></a> <br>
# All of them are words except the comma. Special characters are treated as separate tokens.
# 
# ## 5-5-1 Tokenizing sentences
# The same principle can be applied to sentences. Simply change word_tokenize() to sent_tokenize().
# We have added two sentences to the variable data:

# In[ ]:


from nltk.tokenize import sent_tokenize, word_tokenize
 
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
print(sent_tokenize(data))

# <a id="552"></a> <br>
# ## 5-5-2 NLTK and arrays
# If you wish, you can store the words and sentences in arrays:

# In[ ]:


from nltk.tokenize import sent_tokenize, word_tokenize
 
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
 
phrases = sent_tokenize(data)
words = word_tokenize(data)
 
             ":) and :D",     # emoticons handled
             "",              # an empty string is correctly handled
             "Today sux",     #  negative slang handled
             "Today sux!",    #  negative slang with punctuation emphasis handled
             "Today SUX!",    #  negative slang with capitalization emphasis
             "Today kinda sux! But I'll get by, lol" # mixed sentiment example with slang and constrastive conjunction "but"
             ]


paragraph = "It was one of the worst movies I've seen, despite good reviews. \
Unbelievably bad acting!! Poor direction. VERY poor production. \
The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"


from nltk import tokenize
lines_list = tokenize.sent_tokenize(paragraph)
sentences.extend(lines_list)

tricky_sentences = [
        "Most automated sentiment analysis tools are shit.",
        "VADER sentiment analysis is the shit.",
        "Sentiment analysis has never been good.",
        "Sentiment analysis with VADER has never been this good.",
        "Warren Beatty has never been so entertaining.",
        "I won't say that the movie is astounding and I wouldn't claim that \
        the movie is too banal either.",
        "I like to hate Michael Bay films, but I couldn't fault this one",
        "It's one thing to watch an Uwe Boll film, but another thing entirely \
        to pay for it",
        "The movie was too good",
        "This movie was actually neither that funny, nor super witty.",
Exemple #56
0
print("*****************Data with no stop words*****************")
print(text_NoStopWords)

# Deleting punctuation marks from the text file
punct = set(string.punctuation)
text_NoPunct = ''.join(x for x in text_NoStopWords if x not in punct)
print("*******************Data without punctuation******************")
print(text_NoPunct)

# Step 3 & 4: Removing verbs from the text by applying word tokenizing and POS
words = word_tokenize(text_NoPunct)
tokes_pos = nltk.pos_tag(words)
for i in tokes_pos:
    if 'VB' not in i[1]:
        words_NoVerbs.append(i[0])
print("*****************Data with no verbs************")
print(words_NoVerbs)

# Step 5 & 6: Fetching the top 5 most occurring words
counts = Counter(words_NoVerbs).most_common(5)
print("************Most repeated words in the tetx file************")
print(counts)

# Steps 7 to 10: Concatenating and printing the statements containing the most frequent words
for top in counts:
    for sent in sent_tokenize(data.lower()):
        if sent not in text_res:
            if top[0] in word_tokenize(sent):
                text_res = text_res + sent
print("**************Final text************")
print(text_res)
    def annotate(self, essay_text):

        try:
            sentences = sent_tokenize(essay_text.strip())
            contents = "\n".join(sentences)

            essay = Essay(full_path=None,
                          include_vague=self.config["include_vague"],
                          include_normal=self.config["include_normal"],
                          load_annotations=False,
                          essay_text=contents)

            processed_essays = process_essays(
                essays=[essay],
                spelling_corrector=self.spelling_corrector,
                wd_sent_freq=self.wd_sent_freq,
                remove_infrequent=self.config["remove_infrequent"],
                spelling_correct=self.config["spelling_correct"],
                replace_nums=self.config["replace_nums"],
                stem=self.config["stem"],
                remove_stop_words=self.config["remove_stop_words"],
                remove_punctuation=self.config["remove_punctuation"],
                lower_case=self.config["lower_case"])

            self.logger.info("Essay loaded successfully")
            essays_TD = self.feature_extractor.transform(processed_essays)

            wd_feats, _ = flatten_to_wordlevel_feat_tags(essays_TD)
            xs = self.feature_transformer.transform(wd_feats)

            wd_predictions_by_code = test_classifier_per_code(
                xs, self.tag_2_wd_classifier, self.wd_test_tags)

            dummy_wd_td_ys_bytag = defaultdict(
                lambda: np.asarray([0.0] * xs.shape[0]))
            sent_xs, sent_ys_bycode = get_sent_feature_for_stacking_from_tagging_model(
                self.sent_input_feat_tags,
                self.sent_input_interaction_tags,
                essays_TD,
                xs,
                dummy_wd_td_ys_bytag,
                self.tag_2_wd_classifier,
                sparse=True,
                look_back=0)
            """ Test Stack Classifier """

            sent_predictions_by_code = test_classifier_per_code(
                sent_xs, self.tag_2_sent_classifier,
                self.sent_output_train_test_tags)
            """ Generate Return Values """
            essay_tags = self.__get_essay_tags_(sent_predictions_by_code)

            essay_type = None
            if "coral" in self.essays_folder.lower():
                essay_type = "CB"
            elif "skin" in self.essays_folder.lower():
                essay_type = "SC"
            else:
                raise Exception("Unknown essay type")

            raw_essay_tags = ",".join(sorted(essay_tags, key=cr_sort_key))

            t_words = self.__get_tagged_words_(essay, essays_TD[0],
                                               wd_predictions_by_code)
            t_sentences = self.__get_tagged_sentences_(
                essay, sent_predictions_by_code)

            tagged_sentences = [
                t_sent.add_word_tags(map(lambda twd: twd.__dict__,
                                         t_wds)).__dict__
                for t_sent, t_wds in zip(t_sentences, t_words)
            ]

            essay_codes, essay_causal = self.__format_essay_tags_(essay_tags)
            return {
                "tagged_sentences": tagged_sentences,
                "essay_codes": essay_codes,
                "essay_causal": essay_causal,
                "essay_category": essay_category(raw_essay_tags, essay_type),
                "raw_essay_tags": raw_essay_tags
            }
        except Exception as x:
            self.logger.exception(
                "An exception occured while annotating essay")
            return {"error": format_exc()}
Exemple #58
0
import math

import nltk
import numpy as np
from nltk import tokenize
from scipy.sparse import csc_matrix


def summarize(text, target_sentences=3):
    """
    Given all the text in a page, determine a number of summarizing sentences.
    """
    def page_rank(G, s=.85, maxerr=.001):
        G = np.array(G)

        n = G.shape[0]

        # transform G into markov matrix M
        M = csc_matrix(G, dtype=float)
        rsums = np.array(M.sum(1))[:, 0]
        ri, ci = M.nonzero()
        M.data /= rsums[ri]

        # bool array of sink states
        sink = rsums == 0

        # Compute pagerank r until we converge
        ro, r = np.zeros(n), np.ones(n)
        while np.sum(np.abs(r - ro)) > maxerr:
            ro = r.copy()
            # calculate each pagerank at a time
            for i in range(0, n):
                # inlinks of state i
                Ii = np.array(M[:, i].todense())[:, 0]
                # account for sink states
                Si = sink / float(n)
                # account for teleportation to state i
                Ti = np.ones(n) / float(n)

                r[i] = ro.dot(Ii * s + Si * s + Ti * (1 - s))

        # return normalized pagerank
        return r / sum(r)

    def vec_tfidf(sent):
        words = map(lambda s: s.lower(), nltk.word_tokenize(sent))
        counts = {}
        for w in words:
            if w not in counts:
                counts[w] = 0

            counts[w] += 1

        max_count = max(counts.values())

        tfidf = {}
        for word in counts.keys():
            tfidf[word] = (0.5 + 0.5 * counts[word] / max_count)
            tfidf[word] *= math.log(1 + brown_sent_count /
                                    total_word_counts.get(word, 1))

        return tfidf

    def cos_similarity(vec1, vec2):
        num = 0
        denom = sum(vec1.values()) * sum(vec2.values())
        for e in vec1.keys():
            if e in vec2:
                num += vec1[e] * vec2[e]

        return 1.0 * num / denom

    tfidf = []
    sentences = tokenize.sent_tokenize(text)
    for sent in sentences:
        tfidf.append(vec_tfidf(sent))

    matrix = []
    for v1 in tfidf:
        row = []
        for v2 in tfidf:
            row.append(cos_similarity(v1, v2))
        matrix.append(row)

    scores = page_rank(matrix)

    return [
        sentences[i]
        for i in sorted(range(len(scores)), key=lambda x: -scores[x])
    ][:target_sentences]
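
# --- Hedged usage sketch (not from the source) ---------------------------
# summarize() relies on two module-level globals that this excerpt does not
# define: brown_sent_count and total_word_counts, used by vec_tfidf for the
# IDF term. Building them from NLTK's Brown corpus is an assumption about
# the original setup, shown here only to make the sketch self-contained.
from collections import Counter
from nltk.corpus import brown  # requires: nltk.download('brown')

brown_sent_count = len(brown.sents())
total_word_counts = Counter(w.lower() for w in brown.words())

article = ("The rover landed safely on Tuesday. Engineers celebrated the "
           "touchdown. The mission will study the planet's climate. "
           "Initial images were released within hours.")
print(summarize(article, target_sentences=2))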
Exemple #59
0
            "dropout_keep_prob").outputs[0]

        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name(
            "output/predictions").outputs[0]

        # Generate batches for one epoch
        batches = dh.batch_iter(list(x_test),
                                FLAGS.batch_size,
                                1,
                                shuffle=False)

        all_predictions = []

        for x_test_batch in batches:
            batch_predictions = sess.run(predictions, {
                input_x: x_test_batch,
                dropout_keep_prob: 1.0
            })
            all_predictions = np.concatenate(
                [all_predictions, batch_predictions])

        ret = dict()
        ret['results'] = list()
        pred_sentence = sent_tokenize(inputEssay)
        for i in range(len(all_predictions)):
            if (all_predictions[i] == 1):
                ret['results'].append(pred_sentence[i])
        print(json.dumps(ret))
# print(inputEssay)
from nltk.tokenize import sent_tokenize, word_tokenize
example = "Hello My name is Muhammad Aashir And i lives in multan, pakistan "
ab = sent_tokenize(example)
print('This is Sent Tokenizer = ', ab)