def process_statuses(uid):
    statuses_list = {}
    in_path = 'Data/' + uid + '/statuses_list.pickle'
    if os.path.exists(in_path):
        f = open(in_path, 'rb')
        j = 0
        while True:
            try:
                statuses = pickle.load(f)
                for status in statuses:
                    j += 1
                    tweet = status.text
                    sents = sent_tokenize(tweet)
                    for sent in sents:
                        # strip retweet markers, mentions, hashtags, URLs and escape sequences
                        sent_text = re.sub(r'RT\s@\w+:\s|@\w+\s|#|http://.*$|http://.*\s|https://.*$|https://.*\s|\n|\\U\w+', "", sent)
                        sent_text = highpoints.sub("", sent_text)
                        tokens = word_tokenize(sent_text)
                        words = [w.lower() for w in tokens if w.isalpha() or w.isalnum()]
                        stop_words = set(stopwords.words('english'))
                        filtered_words = [w for w in words if w not in stop_words]
                        # structure: key = original sentence, value = filtered_words
                        statuses_list[sent] = filtered_words
            except EOFError:
                print(j)
                break
    return statuses_list
def embed(sentences):
    model = word2vec.load('~/word2vec_models/GoogleNews-vectors-negative300.bin')
    embedded_sentences = []
    tokenized_sentences = []
    max_len = 0
    for sentence in sentences:
        tokenized_sentence = word_tokenize(sentence)
        tokenized_sentences.append(tokenized_sentence)
        if len(tokenized_sentence) > max_len:
            max_len = len(tokenized_sentence)
    for tokenized_sentence in tokenized_sentences:
        embedded_words = []
        for word in tokenized_sentence:
            try:
                word = model[word]       # look up the embedding for this word
            except:
                word = np.zeros(300)     # out-of-vocabulary words map to a zero vector
            embedded_words.append(word)
        # pad every sentence to the same length
        for i in range(max_len - len(embedded_words)):
            embedded_words.append(np.zeros(300))
        embedded_sentences.append(embedded_words)
    embedded_sentences = np.array(embedded_sentences)
    return embedded_sentences
def sentences(a, b):
    """Return sentences in both a and b"""
    asplit = sent_tokenize(a)
    bsplit = sent_tokenize(b)
    # use a set again to deduplicate matches
    same = {x for x in asplit if x in bsplit}
    return list(same)
def load_file_sentences(filepath):
    index = filepath.rfind('/')
    if index < 0:
        sents = sent_tokenize(PlaintextCorpusReader('.', filepath).raw())
    else:
        sents = sent_tokenize(PlaintextCorpusReader(filepath[:index], filepath[index + 1:]).raw())
    return sents
def realtime():
    model_parsing()
    data_df = pd.read_csv('Test_Survey.csv')
    data_df.Verbatim = data_df.Verbatim.fillna(0)
    unique_id = data_df['Unique_Id']
    verbatims = data_df['Verbatim']
    data_dict = dict(zip(unique_id, verbatims))
    Results_df = pd.DataFrame(columns=('Unique_id', 'Sentence', 'category', 'Sentiment'))
    model_df = pd.read_csv('Model_modified_twitter_test.csv')
    for uid, line in data_dict.items():
        # Decode so the program doesn't run into Unicode errors; add error handling
        # to avoid issues with other formats.
        line = str(line).decode('utf-8', errors='ignore')
        try:
            line_list = tokenize.sent_tokenize(str(line))
            for line in line_list:
                original_line = line
                for p in list(punctuation):
                    line = line.replace(p, '')
                line = line.lower()
                line_SC = tb.blob.BaseBlob(line)
                line = line_SC.correct()
                line = str(line)
                sentiment_score = sentiment_calc(line)
                temp_df = core_classify(line, uid, sentiment_score, model_df, original_line)
                # Results_df = Results_df.append(temp_df)
                yield temp_df
        except UnicodeEncodeError:
            temp_df = pd.DataFrame({'Unique_id': [uid], 'Sentence': [original_line],
                                    'category': ['Invalid text data'], 'Sentiment': [sentiment_score]})
            # Results_df = Results_df.append(temp_df)
            yield temp_df
    Results_df.to_csv('test_analysis.csv', index=False, encoding='utf-8')
def split_sentence_based_on_rules(sent):
    if re.search(r' \.+ ', sent):
        sentences = re.split(r' \.+ ', sent)
    elif re.search(r'@ ---- @', sent):
        sentences = re.split(r'@ ---- @', sent)
    elif re.search(r'\.\w+\:', sent):
        sent = re.sub(r'\.(\w+)\:', r'. \1:', sent)
        sentences = sent_tokenize(sent)
    elif re.search(r'\, as well as', sent):
        sent = sent.replace(', as well as', '. As well as')
        sentences = sent_tokenize(sent)
    elif re.search(r'[a-z\.]+[A-Z][a-z]+:', sent):
        k = re.findall(r' [a-z\.]+([A-Z][a-z]+:)', sent)
        p = chr(ord(max(sent)) + 1)
        sentences = sent.replace(k[0], p + k[0]).split(p)
    elif re.search(r'\; ', sent):
        sent = re.sub(r'\; ', r'. ', sent)
        sentences = sent_tokenize(sent)
    elif re.search(r', and, ', sent):
        sent = sent.replace(', and, ', '. And, ')
        sentences = sent_tokenize(sent)
    elif re.search(r'president\: Wechsler', sent):
        sent = sent.replace(': ', '. ')
        sentences = sent_tokenize(sent)
    elif re.search(r'\, ', sent):
        sentences = re.split(r'\, ', sent)
    else:
        # fall back to a hard split at 350 characters
        sentences = [sent[:350], sent[350:]]
        print("Using greedy sentence tokenization")
    text_len = [len(sentence) for sentence in sentences]
    return sentences
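# A minimal usage sketch for the rule-based splitter above (the example string
# is made up). This input hits the first rule, which splits on a run of dots
# surrounded by spaces, so no NLTK data is needed for this particular case:
example = "First clause ... second clause"
print(split_sentence_based_on_rules(example))
# should print: ['First clause', 'second clause']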
def post(self):
    args = parser.parse_args()
    text = {'text': args['text']}
    print text
    print sent_tokenize(text['text'])
    print word_tokenize(text['text'])
    return text['text']
def inputfactx(rev, include_vpr):
    this_business = find_business(rev.bizid)
    this_user = find_user(rev.uid)
    result = [this_business.stars]
    if include_vpr:
        result += [this_user.get_vpr()]
    result += [this_user.reviewCount, len(rev.text), rev.stars, rev.get_days()]
    if len(rev.text) == 0:
        result += [0, 0, 0, 0, 0]
    else:
        excount = 0
        for sent in sent_tokenize(rev.text):
            ss = sent.strip()
            if ss.endswith('!'):
                excount += 1
        result += [excount,
                   np.mean([len(sent) for sent in sent_tokenize(rev.text)]),
                   len(sent_tokenize(rev.text)),
                   len(re.findall('\n\n', rev.text)) + 1,
                   len(rev.text.splitlines()[0])]
    result += [this_business.longitude, this_business.latitude]
    return result
def tokenize_sentences(filename):
    file_dir = docs_dir + str(filename)
    f = open(file_dir, 'r')
    root = ET.parse(f).getroot()
    tags = root.getiterator('str')  # read the relevant tags
    title_string = ''
    desc_string = ''
    for tag in tags:
        if tag.get('name') == 'Title':
            title_string = filter(lambda x: x in string.printable, tag.text.lower().strip())
        elif tag.get('name') == 'Abstract':
            desc_string = filter(lambda x: x in string.printable,
                                 tag.text.lower().strip().replace('relevant documents will describe', ''))
    f.close()
    sentences = sent_tokenize(title_string)
    title_words = []
    for s in sentences:
        title_words = title_words + word_tokenize(s)
    sentences = sent_tokenize(desc_string)
    desc_words = []
    for s in sentences:
        desc_words = desc_words + word_tokenize(s)
    return (title_words, desc_words)
def split_reddit_reviews(self, reviews):
    columns = ['Text', 'Score', 'True']
    # Calculate the total number of sentences to size the data frame
    count = 0
    for index, each_review in reviews.iterrows():
        split_sentences = sent_tokenize(each_review['Text'])
        count += len(split_sentences)
    print "total number of sentences {}".format(count)
    df = pd.DataFrame(index=range(0, count), columns=columns)
    text_list, score_list, true_list = [], [], []
    for index, each_review in reviews.iterrows():
        split_sentences = sent_tokenize(each_review['Text'])
        actual_tag = each_review['True']
        score_tag = each_review['Score']
        for each_split_sentence in split_sentences:
            text_list.append(each_split_sentence)
            score_list.append(score_tag)
            true_list.append(actual_tag)
    print "Count ={} Text.length {}".format(count, len(text_list))
    df['Text'] = text_list
    df['Score'] = score_list
    df['True'] = true_list
    df.to_csv('../data/reddit_reviews.csv')
def tokenize(text, grams=1):
    wordStems = lambda s: map(stem, word_tokenize(s))
    sentTokens = lambda tok, s: tok + wordStems(s)
    if grams == 1:
        return list(reduce(sentTokens, sent_tokenize(text), []))
    else:
        return list(ngrams(reduce(sentTokens, sent_tokenize(text), []), grams))
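# Usage sketch, assuming Python 2 semantics (map returns a list) and that
# `stem`, `reduce` and nltk.util.ngrams are already bound in this module:
print tokenize("Cats chase mice. Dogs chase cats.")             # stemmed unigrams
print tokenize("Cats chase mice. Dogs chase cats.", grams=2)    # stemmed bigram tuples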
def main(param=0):
    '''0 for no stemmer, 1 for Porter, 2 for Lancaster'''
    both_pos_index = {}
    tit_pos_index = {}
    abs_pos_index = {}
    if param == 0:
        path = './NoStemmer/'
    elif param == 1:
        path = './Porter/'
    elif param == 2:
        path = './Lancaster/'
    for i in range(1, 1001):
        # open the xml file and get the abstract and title
        try:
            filename = "./data/%d.xml" % i
            data = open(filename)
        except:
            print "can't open file %s" % filename
            return 0
        docid = filename.split('/')[-1].split('.')[-2]
        tree = etree.fromstring(data.read())
        title = tree.find('Title').text
        abstract = tree.find('Abstract').text
        #####################################################
        # Step 2: tokenize and build the position index     #
        #####################################################
        # sentence tokenization
        if title != None:
            title = title.replace('[', '', 1).replace(']', '', 1)
            titles = [s.replace('&', '') for s in sent_tokenize(title)]
            tit_pos_index = position_index(tit_pos_index, titles, docid, param)
        if abstract != None:
            abstracts = [s.replace('&', '') for s in sent_tokenize(abstract)]
            both = titles + abstracts
        else:
            both = titles
        both_pos_index = position_index(both_pos_index, both, docid, param)
    # save the position index to json
    with codecs.open('./' + path.split('/')[1] + '_both_index' + '.json', mode='w') as a:
        json.dump(both_pos_index, a)
def tag_words_by_sentence(input_filename, output_path=''):
    # text = get_file_text(input_filename)
    text = 'Every day I see blue. But the sky is red. Eagles are green'
    sentences = sent_tokenize(text)
    word_tokens = [word_tokenize(s) for s in sentences]
    # word_tokens = nltk.tag.batch_pos_tag(sent_tokenize(text))
    word_pos = nltk.tag.batch_pos_tag(word_tokens)
    return word_pos
def sentences(a, b):
    """Return sentences in both a and b"""
    a1 = set(sent_tokenize(a))
    b1 = set(sent_tokenize(b))
    ans = []
    for line in a1:
        if line in b1:
            ans.append(line)
    return ans
def lexical_features(self):
    """ Lexical features """
    features = []
    # Add the first token from the top-1st span on the stack
    if self.stackspan1 is not None:
        text = self.stackspan1.text
        texts1 = word_tokenize(text)
        sent_tokenize_list = sent_tokenize(text)
        wordb = word_tokenize(sent_tokenize_list[0])
        worde = word_tokenize(sent_tokenize_list[-1])
        features.append(('StackSpan1', 'BEGIN-WORD-STACK1', wordb[0].lower()))
        features.append(('StackSpan1', 'BEGIN-END-STACK1', worde[-1].lower()))
        features.append(('StackSpan1', 'BEGIN-END-WORD-STACK1', wordb[0].lower(), worde[-1].lower()))
    if self.stackspan2 is not None:
        text = self.stackspan2.text
        texts2 = word_tokenize(text)
        sent_tokenize_list = sent_tokenize(text)
        wordb = word_tokenize(sent_tokenize_list[0])
        worde = word_tokenize(sent_tokenize_list[-1])
        features.append(('StackSpan2', 'BEGIN-WORD-STACK2', wordb[0].lower()))
        features.append(('StackSpan2', 'BEGIN-END-STACK2', worde[-1].lower()))
    if self.queuespan1 is not None:
        text = self.queuespan1.text
        textq1 = word_tokenize(text)
        sent_tokenize_list = sent_tokenize(text)
        wordb = word_tokenize(sent_tokenize_list[0])
        worde = word_tokenize(sent_tokenize_list[-1])
        features.append(('QueueSpan1', 'BEGIN-WORD-QUEUE1', wordb[0].lower()))
        features.append(('QueueSpan1', 'BEGIN-END-QUEUE', worde[-1].lower()))
        features.append(('QueueSpan1', 'BEGIN-END-WORD-QUEUE1', wordb[0].lower(), worde[-1].lower()))
    if self.stackspan2 is not None and self.stackspan1 is not None:
        features.append(('StackSpan1', 'LENGTH-STACK1-STACK2', len(texts1), len(texts2)))
    if self.queuespan1 is not None and self.stackspan1 is not None:
        features.append(('StackSpan1', 'LENGTH-STACK1-QUEUE1', len(texts1), len(textq1)))
        # features.append(('StackSpan1', 'POS-START-STACK1-QUEUE1', begins1, beginq1))
    for feat in features:
        yield feat
def parse(body):
    contents = []
    if isinstance(body, basestring):
        contents.append(body)
    else:
        contents = body
    sentences = []
    for content in contents:
        sentences.extend([sentence for sentence in sent_tokenize(content)
                          if not str_helper.hasHTMLTag(sentence)])
    stop = stopword.get_stopwords()
    tokens = {}
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word not in stop and not str_helper.hasNumbers(word) and not str_helper.hasPunctuation(word):
                word = stem.stemming(word)
                tokens.setdefault(word, 0)
                tokens[word] += 1
    wp = pos_tag(tokens.keys())
    words = [row[0] for row in wp]
    tags = [row[1] for row in wp]
    return words, tags
def line_to_sentences(line):
    raw_sentences = sent_tokenize(line.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(w2v_normalize(raw_sentence))
    return sentences
def __init__(self, content, remove_punct=True):
    self._tokcont = [word_tokenize(s) for s in sent_tokenize(content)]
    if remove_punct:
        self._tokcont = [[w for w in s if w not in punctuation]
                         for s in self._tokcont[:]]
    # Remove zero-length sentences
    self._tokcont = [s for s in self._tokcont[:] if len(s) > 0]
def markovize(word1, word2, word3, fileid, char_limit=None):
    with open(fileid, encoding='utf-8') as f:
        text = f.read()
    sentences = sent_tokenize(text)
    sent_tokens = defaultdict(list)
    for sentence in sentences:
        tokens = re.findall(r"[\w']+|[.,?!:;]", sentence)
        nwise_ = nwise(tokens, n=4)
        if nwise_:
            for token1, token2, token3, token4 in nwise_:
                sent_tokens[token1, token2, token3].append(token4)
    too_long = True
    while too_long:
        sentence = [word1, word2, word3]
        utterance = build_sentence(sentence, sent_tokens)
        len_utterance = len(utterance)
        if char_limit is not None and len_utterance > char_limit:
            too_long = True
        else:
            too_long = False
    return utterance
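# Usage sketch (the file name and seed words are made up; `nwise` and
# `build_sentence` are helpers assumed to be defined elsewhere in this module):
# utterance = markovize("the", "quick", "fox", "corpus.txt", char_limit=280)
# print(utterance)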
def split_sentence_from_document(document):
    max_counts = 0
    for sent in tokenize.sent_tokenize(document):
        max_counts = max(max_counts, len(tokenize.wordpunct_tokenize(sent)))
    # if max_counts > 4000:
    #     print(document)
    return max_counts
def map(self, story):
    result = []
    print "parsing: %s" % story
    # load the grammar file
    # note: the atis grammar will only work with the atis sample sentences
    atis_grammar = nltk.data.load('file:/roger/nltk_data/grammars/large_grammars/atis.cfg')
    # create a new parser with the grammar
    parser = nltk.ChartParser(atis_grammar)
    # split the story into sentence tokens
    sentence_tokens = sent_tokenize(story)
    for sentence_token in sentence_tokens:
        # split each sentence into word tokens
        word_tokens = word_tokenize(sentence_token)
        # map each possible sentence structure
        for tree in parser.parse(word_tokens):
            print tree
            result.append(tree)
    return result
def processKeywordSearch(self):
    searchString = self._args[0]
    while True:
        article = self._taskQueue.get()
        if article == END_OF_QUEUE:
            break
        else:
            articlePathPartList = article['filePath'].split('/')
            articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
            articleCompany = self._db.getCompanyByCode(articleCompanyCode)
            articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
            articleSentenceList = []
            # Combining regexes with '|' is fine here because sentences are short, so it
            # barely affects performance; the DB search uses an iterative approach instead.
            pattern = getPatternByKeywordSearchString(searchString)
            # Search on the sentence level first; if nothing is found, fall back to the paragraph level.
            for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
                sentenceList = sent_tokenize(paragraph)
                for sentence in sentenceList:
                    if re.search(pattern, sentence) is not None:
                        articleSentenceList.append(sentence.encode('utf-8').strip())
            if not articleSentenceList:
                # search on the paragraph level
                for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
                    if re.search(pattern, paragraph) is not None:
                        articleSentenceList.append(paragraph.encode('utf-8').strip())
            lineList = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'],
                        article['date'], article['sourceName'].strip(), article['byline'].strip(),
                        article['headline'].strip(), '\t'.join(articleSentenceList)]
            self._resultQueue.put(lineList)
def get_stemmed_separate(indeed_reviews_db, glassdoor_reviews_db):
    separate = get_separate_reviews(indeed_reviews_db, glassdoor_reviews_db)
    stemmer = PorterStemmer()
    stemmed_reviews = []
    for review in separate:
        stemmed_reviews.append(' '.join([stemmer.stem(word)
                                         for sent in sent_tokenize(review)
                                         for word in word_tokenize(sent.lower())]))
    return stemmed_reviews
def __call__(self):
    '''tokenize sentences, lower cases, replace digits'''
    text = file(self.inputFile).read().lower()
    text = filter(lambda x: x in printable, text)  # remove non-ascii characters
    # tokenize the document into sentences, lower case
    sent_tokenize_list = sent_tokenize(text.strip().lower(), "english")
    for sent_idx in xrange(len(sent_tokenize_list)):
        updated_sent = []  # a modified sentence
        # remove all punctuation
        sent_tokenize_list[sent_idx] = sent_tokenize_list[sent_idx].translate(replace_punctuation)
        # sent_tokenize_list[sent_idx] is a list of unigrams now
        sent_tokenize_list[sent_idx] = TreebankWordTokenizer().tokenize(sent_tokenize_list[sent_idx])
        term_idx = 0
        sentLen = len(sent_tokenize_list[sent_idx])
        while term_idx < sentLen:
            flag = 1
            curr_term = sent_tokenize_list[sent_idx][term_idx]
            if mesh_phrase_idx.get(curr_term):
                # the maximum length of a phrase starting with the current term
                maxPhraseLen = mesh_phrase_idx.get(curr_term)
                for n in xrange(maxPhraseLen, 1, -1):  # iterate from n down to 2
                    curr_n_gram = " ".join(sent_tokenize_list[sent_idx][term_idx:min(term_idx + n, sentLen)])
                    if mesh_phrase_dict.get(curr_n_gram):
                        updated_sent.append(mesh_phrase_dict.get(curr_n_gram))
                        term_idx += n  # move the pointer
                        flag = 0
                        break
                if flag:
                    updated_sent.append(curr_term)
                    term_idx += 1
            else:
                updated_sent.append(curr_term)
                term_idx += 1
        # replace isolated digits
        sent_tokenize_list[sent_idx] = re.sub(r"\b\d+\b", " ", " ".join(updated_sent))
    self.__save__(sent_tokenize_list)
def tokenize(self, document):
    """
    Break text into sentences and each sentence into a list of single words.
    Ignore any token that falls into the stopwords set.
    """
    # use the sentence tokenizer sent_tokenize from the nltk package
    sentences = sent_tokenize(utils.to_unicode(document.lower()))
    # create a stemmer of class SnowballStemmer
    stemmer = SnowballStemmer("english")
    for sentence in sentences:
        words = [word for word in utils.tokenize(self.cleanse_text(sentence))]
        if self.remove_stopwords:
            words = [word for word in words if word not in self.en_stopwords]
        if self.stemming:
            words = [stemmer.stem(t) for t in words]
        yield words
def stat_reviews(reviews):
    """
    :type reviews: list[Review]
    :param reviews:
    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    stats = np.zeros(5)
    num_reviews = len(reviews)
    for review in reviews:
        text = review.text
        num_sentences = len(tokenize.sent_tokenize(text))
        num_words = len(tokenizer.tokenize(text.lower()))
        tagged_words = review.tagged_words
        tags_count = Counter(tag for word, tag in tagged_words)
        num_past_verbs = float(tags_count['VBD'])
        num_verbs = tags_count['VB'] + tags_count['VBD'] + tags_count['VBG'] + \
            tags_count['VBN'] + tags_count['VBP'] + tags_count['VBZ']
        ratio = (num_past_verbs + 1) / (num_verbs + 1)
        stats[0] += num_sentences
        stats[1] += num_words
        stats[2] += num_past_verbs
        stats[3] += num_verbs
        stats[4] += ratio
    for index in range(len(stats)):
        stats[index] /= num_reviews
    print('Average sentences:', stats[0])
    print('Average words:', stats[1])
    print('Average past verbs:', stats[2])
    print('Average verbs:', stats[3])
    print('Average past verbs ratio:', stats[4])
def extract_post_features(self, post):
    # Assume post consists of one string
    post_features = [0, {}, {}]
    sent_tokenized = sent_tokenize(post)
    sent_word_tokenized = [word_tokenize(s) for s in sent_tokenized]
    if self.ngram_features:
        ngrams = len(self.ngram_features.keys()[0])
    for sentence in sent_word_tokenized:
        if self.ngram_features:
            ngramsentence = find_ngrams(sentence, ngrams)
            for ngram in ngramsentence:
                if ngram in self.ngram_features.keys():
                    if ngram in post_features[2]:
                        post_features[2][ngram] += 1
                    else:
                        post_features[2][ngram] = 1
        for token in sentence:
            if token in self.word_features.keys():
                if token in post_features[1]:
                    post_features[1][token] += 1
                else:
                    post_features[1][token] = 1
    modified_list = [(0, 0)]
    for word, number in self.word_features.iteritems():
        if word in post_features[1].keys():
            modified_list.append((number, post_features[1][word]))
    for ngram, number in self.ngram_features.iteritems():
        if ngram in post_features[2].keys():
            modified_list.append((number, post_features[2][ngram]))
    # print modified_list
    return [x[0] for x in modified_list]
def make_sentences(self):
    """
    Makes sentences from raw documents. Each sentence is wrapped up in a Sentence object.
    :return: None
    """
    # Create parameters for NER and dependency parsing and pass them to the Sentence object.
    # Read the config file.
    config = CP.RawConfigParser()
    config.read('config.py')
    # Server for dependency parsing
    server = ServerProxy(JsonRpc20(), TransportTcpIp(addr=("127.0.0.1", 8080), timeout=200.0))
    # Parameters for named entity recognition:
    # get the classifier and tagger locations from the config file
    tagger = config.get('NER', 'tagger')          # path of the Stanford tagger
    classifier = config.get('NER', 'classifier')  # path of the Stanford classifier
    st = StanfordNERTagger(classifier, tagger)
    if self.document == None:
        return
    sent = sent_tokenize(self.document)  # contains raw sentences
    for i in range(len(sent)):
        # We also pass the server object and the NER tagger
        s = Sentence(sent[i], i, server, st)
        self.sentences.append(s)
def missingCorpus(corpusdir):
    try:
        os.makedirs(corpusdir)
    except OSError:
        if not os.path.isdir(corpusdir):
            raise
    try:
        os.makedirs(corpusdir + '/ratings')
    except OSError:
        if not os.path.isdir(corpusdir + '/ratings'):
            raise
    hotel = json.load(open(data_path + file))
    stopset = hotelNameAddress(hotel)
    stopgroup = ""
    for e in stopset:
        stopgroup += e + " "
    stopgroup = stopgroup[0:-1]
    with open(corpusdir + '/stopset.txt', 'w') as fout:
        fout.write(stopgroup)
    revNum = 0
    for review in hotel.get('Reviews'):
        revNum += 1
        contentOut = ""
        overall = review.get('Ratings').get('Overall')
        content = pos_tag_sents([word_tokenize(sentence) for sentence in sent_tokenize(review.get('Content'))])
        with open(corpusdir + '/ratings/OverallRating' + str(revNum) + '.txt', 'w') as fout:
            fout.write(overall)
        with codecs.open(corpusdir + '/Review' + str(revNum) + '.txt', 'w', encoding="utf-8") as fout:
            for sentence in content:
                for word, pos in sentence:
                    contentOut += word + "/" + pos + " "
                contentOut += '\n'
            fout.write(contentOut)
def training_ner(self, paragraph, classification):
    sentence = sent_tokenize(paragraph)
    # print paragraph
    # result = []
    train = []
    sentence_ne = ""
    # 1. Split the paragraph into sentences
    for index, data in enumerate(sentence):
        tokenize = word_tokenize(data)
        div_sentence = []
        for word in tokenize:
            # check_kota = len(list(self.db.cities.find({"kota": re.compile("^" + word + "$", re.IGNORECASE)}))) >= 1
            # check whether the word is a known city/location
            check_kota = (self.db.location.find({"$text": {"$search": word.lower()}}).count()) >= 1
            # print "word : %s, check : %s" % (word, check_kota)
            if not check_kota:
                # if the word is not a city, reduce it to its base form (stem)
                sent_stem = self.stemmer.stem(word)
                word = sent_stem
            div_sentence.append(word)
        train.append(" ".join(div_sentence))
    # parameters: self.div_sentence_ner(stemmed_sentence, original_sentence, classification_type)
    sentence_ne = self.div_sentence_ner("".join(train), " ".join(tokenize), classification)
    # result.append(sentence_ne)
    # reset the train list so it is not reused in the next NER training run
    train = []
    return sentence_ne
def load_dataset(args, dataset_name, sup_source, num_seed_doc=10, common_words=10000,
                 truncate_doc_len=None, truncate_sent_len=None, with_eval=True):
    data_path = './' + dataset_name
    data, y, class_tree = read_file(dataset_name, with_eval=with_eval)
    np.random.seed(1234)
    data = preprocess_doc(data)
    data = [s.split(" ") for s in data]
    trun_data = [s[:truncate_doc_len] for s in data]
    tmp_list = [len(doc) for doc in data]
    len_max = max(tmp_list)
    len_avg = np.average(tmp_list)
    len_std = np.std(tmp_list)
    print("\n### Dataset statistics - Documents: ###")
    print(f'Document max length: {len_max} (words)')
    print(f'Document average length: {len_avg} (words)')
    print(f'Document length std: {len_std} (words)')
    if truncate_doc_len is None:
        truncate_doc_len = min(int(len_avg + 3 * len_std), len_max)
    print(f"Defined maximum document length: {truncate_doc_len} (words)")
    print(f'Fraction of truncated documents: {sum(tmp > truncate_doc_len for tmp in tmp_list) / len(tmp_list)}')
    sequences_padded = pad_docs(trun_data, pad_len=truncate_doc_len)
    word_counts, vocabulary, vocabulary_inv, trim_vocabulary = build_vocab(sequences_padded, common_words)
    print(f"Vocabulary Size: {len(vocabulary_inv):d}")
    x = build_input_data(sequences_padded, vocabulary)
    x = np.array(x)
    assign_data_to_nodes(args, x, y, class_tree)

    # Prepare sentences for training the LSTM language model
    trun_data = [" ".join(doc) for doc in trun_data]
    flat_data = [tokenize.sent_tokenize(doc) for doc in trun_data]
    flat_data = [sent for doc in flat_data for sent in doc]
    flat_data = [sent for sent in flat_data if len(sent.split(" ")) > 5]
    tmp_list = [len(sent.split(" ")) for sent in flat_data]
    max_sent_len = max(tmp_list)
    avg_sent_len = np.average(tmp_list)
    std_sent_len = np.std(tmp_list)
    if truncate_sent_len is None:
        truncate_sent_len = min(int(avg_sent_len + 3 * std_sent_len), max_sent_len)
    print("\n### Dataset statistics - Sentences: ###")
    print(f'Sentence max length: {max_sent_len} (words)')
    print(f'Sentence average length: {avg_sent_len} (words)')
    print(f"Defined maximum sentence length: {truncate_sent_len} (words)")
    print(f'Fraction of truncated sentences: {sum(tmp > truncate_sent_len for tmp in tmp_list) / len(tmp_list)}')
    flat_data = [s.split(" ") for s in flat_data]
    sequences = build_sequence(flat_data, trim_vocabulary, truncate_sent_len)

    perm = np.random.permutation(len(x))
    if sup_source == 'keywords':
        load_keywords(data_path, class_tree)
    elif sup_source == 'docs':
        if dataset_name == 'yelp':
            class_type = 'sentiment'
            num_keywords = 5
        else:
            class_type = 'topic'
            num_keywords = 10
        extract_keywords(data_path, class_tree, class_type, vocabulary,
                         num_seed_doc, num_keywords, data, perm)
    x = x[perm]
    if y is not None:
        if type(y) == dict:
            inv_perm = {k: v for v, k in enumerate(perm)}
            perm_y = {}
            for doc_id in y:
                perm_y[inv_perm[doc_id]] = y[doc_id]
            y = perm_y
        else:
            y = y[perm]
    return x, y, sequences, class_tree, word_counts, vocabulary, vocabulary_inv, len_avg, len_std, perm
def sentenceTokenizing(self, sentence):
    print(sent_tokenize(sentence))
# import the modules required
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize

# Load the corpus from a text file and tokenize it into sentences.
with open('matter.txt', 'r') as f:
    data = f.read()
Text = sent_tokenize(data)

# Total number of sentences in the data. Prints 14 for this text.
print(len(Text))

# Define the preprocessor routine for the data.
def preprocess(sentence):
    sentence = sentence.lower()
    sentence = "".join([x for x in sentence if x not in string.punctuation])
    sentence = [x for x in sentence.split(" ") if x not in stopwords.words('english')]
    sentence = [x for x in sentence if x != '']
    return " ".join(sentence)

# Fit a bag-of-words estimator and transform the count matrix.
bow_vectorizer = CountVectorizer(lowercase=True, preprocessor=preprocess)
model = bow_vectorizer.fit(Text)
bag_of_words = model.transform(Text)

# Get the frequencies of the words.
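# One possible way to get the word frequencies from the count matrix built
# above (a sketch; assumes scikit-learn >= 1.0, where the vocabulary is exposed
# via get_feature_names_out() -- older versions use get_feature_names()):
import numpy as np
word_counts = np.asarray(bag_of_words.sum(axis=0)).ravel()
vocab = bow_vectorizer.get_feature_names_out()
frequencies = sorted(zip(vocab, word_counts), key=lambda wc: wc[1], reverse=True)
print(frequencies[:10])  # the ten most frequent words with their counts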
return b

f = open("sample.txt", "r")
data = f.read()
# print(data)
d1 = data.replace("?", "? ")
d2 = d1.replace('.', '. ')
data = d2.replace('!', '! ')
data = data.replace("\n", ' ')
# print(data)
sent_tokenize_list = sent_tokenize(data)
print(sent_tokenize_list)
f = open('sentence.txt', 'w')
for i in sent_tokenize_list:
    f.write(i)
    f.write("\n")
no_of_sents = len(sent_tokenize_list)
table = data.maketrans('', '', string.punctuation)
for i in range(no_of_sents):
    sent_tokenize_list[i] = sent_tokenize_list[i].translate(table)
def tokenize_sentence(file_str):
    """Return a list of tokens"""
    sent_tokenize_list = sent_tokenize(file_str)
    for i in range(0, 1):
        word_token = word_tokenize(sent_tokenize_list[i])
    return word_token
with open(sys.argv[1], "rb") as csvfile:
    city = csv.reader(csvfile, delimiter=',', quotechar='"')
    counter = 0
    count2 = 0
    for row in city:
        if counter == 0:
            inner_counter = 0
            for f in row:
                fields[f.strip()] = inner_counter
                inner_counter += 1
        else:
            try:
                if len(row[fields['neighbourhood_cleansed']]) > 1:
                    n = row[fields['neighbourhood_cleansed']].strip().lower()
                    d = row[fields['description']]
                    s = sent_tokenize(d)
                    for t in s:
                        search = '%s is' % (n)
                        n_idx = t.lower().find(search)
                        if n_idx > -1:
                            tags = pos_tag(word_tokenize(t[n_idx:]))
                            for tag in tags:
                                if tag[1] == 'JJ':
                                    if n in ngh_adjs:
                                        ngh_adjs[n].append(tag[0])
                                    else:
                                        ngh_adjs[n] = [tag[0]]
            except:
                pass
        counter += 1
for line in o:
    while (nomeCien_re.search(line)):
        line = re.sub(nomeCien_re, ' (taxonomia) ', line)
    while (Universidade_re.search(line)):
        line = re.sub(Universidade_re, ' (Universidade) ', line)
    while (refe_re.search(line)):
        line = re.sub(refe_re, ' ', line)
    while (num_re.search(line)):
        line = re.sub(num_re, ' (numeros) ', line)
    result = [word_tokenize(t, 'portuguese') for t in sent_tokenize(line, 'portuguese')]
    sentencas = []
    chunks = []
    for sent in result:
        chunks = []
        for word, tag in tagger2.tag(sent):
            if (word == 'da' or word == 'das' or word == 'do' or word == 'dos' or
                    word == 'na' or word == 'nas' or word == 'no' or word == 'nos'):
                tag = 'PREP'
            chunk = (word + '/' + tag)
            chunks.append(chunk)
        sentencas.append(chunks)
import os

if __name__ == '__main__':
    initial_time = time()
    corruptedFiles = []
    sentencesCount = 0
    processedFiles = 0
    for path in os.listdir('MCE-corpus'):
        try:
            parser = etree.XMLParser(recover=True)
            tree = etree.parse('MCE-corpus/' + path, parser)
            root = tree.getroot()
            sentences = sent_tokenize(root.find('body').text)
            sentencesCount += len(sentences)
            processedFiles += 1
        except:
            corruptedFiles.append(path)
    final_time = time()
    print('The files have been processed.')
    print('Average number of sentences per comment: ' + str(round(sentencesCount / processedFiles, 2)))
    if len(corruptedFiles) > 0:
        print('The following files cannot be processed: \n')
        for x in corruptedFiles:
            print(x)
# Compute ROUGE
rouge = computeRouge(summary, reference)
calculate_words(data)
# Check: is this better than the previous ROUGE score?
if rouge > prev_rouge and new_li < 100 and all(isLavg) == True:
    prev_rouge = rouge
    my_string = "Attempt = {}\t Rouge = {}\t Word count = {}\t"
    print(my_string.format(t, rouge, new_li))

print("Start ...")
print("Reading document ...")
text = readText("training/AP880310-0257")
reference = "Senators McClure (R) and Metzenbaum (D) have sponsored bills to prevent plastic guns from slipping through airport security. The gun, not yet manufactured, is intended for military and police use. Metzenbaum's bill would require some detectable metal content; McClure's would require more sensitive detection equipment at airports, almost certainly causing passenger delays. The NRA opposes the first federal gun ban bill in America, and warns members their guns will soon be inspected and weighed by government agents. However, on this issue they will compromise, not to ban the gun, but to increase airport security. Handgun control advocates and law enforcement officials back Metzenbaum's proposal."
sentences = sent_tokenize(text)
max_n = len(sentences)
l_avg = len(word_tokenize(text)) / len(sentences)

# STAGE 1: pick a random number of sentences
num_sentence = randomNumberSentence(max_n)
print("Need {} sentences..".format(num_sentence))

vectorizer = CosumTfidfVectorizer()
vectorizer.fit(text)
vector = vectorizer.weight_matrix
print("Computing ROUGE")
# STAGE 2: compute ROUGE
fi = open('Input5.txt', encoding="utf8")
strss = ""
for line in fi:
    strss = strss + line
# print(strss)

class my_struct():
    def __init__(self, i, j, sim):
        self.i = i
        self.j = j
        self.sim = sim

sents = sent_tokenize(strss)
# print(sents)
l = len(sents)
# print(l)
sim = []
for i in range(l):
    for j in range(l):
        if (j > i):
            cal = similarity(sents[i], sents[j], False)
            sim.append(my_struct(i, j, cal))
            # print("%d %d %.3f" % (i, j, cal))
            # print("%s\t%s\t%.3f\t%.3f" % (sents[i], sents[j], similarity(sents[i], sents[j], False), similarity(sents[i], sents[j], True)))
lsm = len(sim)
sim.sort(key=lambda x: x.sim, reverse=True)
tic = time.time()
categories = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
Y = train[categories].values

train["comment_text"].fillna("no comment", inplace=True)
train["comment_text"] = train["comment_text"].apply(lambda x: clean_corpus(x))
test["comment_text"].fillna("no comment", inplace=True)
test["comment_text"] = test["comment_text"].apply(lambda x: clean_corpus(x))

train["sentences"] = train["comment_text"].apply(lambda x: tokenize.sent_tokenize(x))
test["sentences"] = test["comment_text"].apply(lambda x: tokenize.sent_tokenize(x))
toc = time.time()
print(toc - tic)

from keras.preprocessing.text import Tokenizer, text_to_word_sequence

raw_text = train["comment_text"]
tk = Tokenizer(num_words=max_features, lower=True)
tk.fit_on_texts(raw_text)

def sentenize(data):
    comments = data["sentences"]
    sent_matrix = np.zeros((len(comments), max_sent, max_text_len),
def main(args, text): import_user_module(args) if args.buffer_size < 1: args.buffer_size = 1 if args.max_tokens is None and args.max_sentences is None: args.max_sentences = 1 assert not args.sampling or args.nbest == args.beam, \ '--sampling requires --nbest to be equal to --beam' assert not args.max_sentences or args.max_sentences <= args.buffer_size, \ '--max-sentences/--batch-size cannot be larger than --buffer-size' print(args) use_cuda = torch.cuda.is_available() and not args.cpu # Setup task, e.g., translation task = tasks.setup_task(args) # Load ensemble print('| loading model(s) from {}'.format(args.path)) models, _model_args = utils.load_ensemble_for_inference( args.path.split(':'), task, model_arg_overrides=eval(args.model_overrides), ) args.copy_ext_dict = getattr(_model_args, "copy_attention", False) # Set dictionaries src_dict = task.source_dictionary tgt_dict = task.target_dictionary # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if use_cuda: model.cuda() # Initialize generator generator = task.build_generator(args) # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) if align_dict is None and args.copy_ext_dict: align_dict = {} max_positions = utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]) if args.buffer_size > 1: print('| Sentence buffer size:', args.buffer_size) #print('| Type the input sentence and press return:') start_id = 0 src_strs = [] results = [] inputs = tokenize.sent_tokenize(text) for batch in make_batches(inputs, args, task, max_positions): src_tokens = batch.src_tokens src_lengths = batch.src_lengths src_strs.extend(batch.src_strs) if use_cuda: src_tokens = src_tokens.cuda() src_lengths = src_lengths.cuda() sample = { 'net_input': { 'src_tokens': src_tokens, 'src_lengths': src_lengths, }, } translations = task.inference_step(generator, models, sample) for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)): src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad()) results.append((start_id + id, src_tokens_i, hypos)) # sort output to match input order res = '' for id, src_tokens, hypos in sorted(results, key=lambda x: x[0]): if src_dict is not None: src_str = src_dict.string(src_tokens, args.remove_bpe) print('S-{}\t{}'.format(id, src_str)) hypo = hypos[0] _, hypo_str, _ = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_strs[id], alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None, align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) if id == 0: res = hypo_str else: res = res + ' ' + hypo_str return res
    no_of_sentences = len(sentences)
    pos_tagged_sentence = pos_tagging(sentence)
    for word in pos_tagged_sentence:
        if word.lower() not in Stopwords and word not in Stopwords and len(word) > 1:
            word = word.lower()
            word = wordlemmatizer.lemmatize(word)
            sentence_score = sentence_score + word_tfidf(dict_freq, word, sentences, sentence)
    return sentence_score

# file = r'C:\Users\Dell\Desktop\FYPr-master\summarized.txt'
# file = open(file, 'r', encoding="utf8")
# text = file.read()
text = sys.argv[1]
input_user = int(sys.argv[2])
tokenized_sentence = sent_tokenize(text)
text = remove_special_characters(str(text))
text = re.sub(r'\d+', '', text)
tokenized_words_with_stopwords = word_tokenize(text)
tokenized_words = [word for word in tokenized_words_with_stopwords if word not in Stopwords]
tokenized_words = [word for word in tokenized_words if len(word) > 1]
tokenized_words = [word.lower() for word in tokenized_words]
tokenized_words = lemmatize_words(tokenized_words)
word_freq = freq(tokenized_words)
# input_user = int(input('Percentage of information to retain (in percent):'))
no_of_sentences = int((input_user * len(tokenized_sentence)) / 100)
# print(no_of_sentences)
c = 1
sentence_with_importance = {}
for sent in tokenized_sentence:
    sentenceimp = sentence_importance(sent, word_freq, tokenized_sentence)
def getSentences(text):
    return sent_tokenize(text)
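# Quick usage check for the wrapper above (example text is made up; assumes the
# punkt tokenizer data is installed):
print(getSentences("NLTK ships a sentence tokenizer. It is called punkt."))
# expected: ['NLTK ships a sentence tokenizer.', 'It is called punkt.']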
    'CZK', 'DJF', 'DKK', 'DOP', 'DZD', 'EGP', 'ERN', 'ETB', 'EUR', 'FJD', 'FKP',
    'GBP', 'GEL', 'GHS', 'GIP', 'GMD', 'GNF', 'GTQ', 'GYD', 'HKD', 'HNL', 'HRK',
    'HTG', 'HUF', 'IDR', 'ILS', 'INR', 'IQD', 'IRR', 'ISK', 'JMD', 'JOD', 'JPY',
    'KES', 'KGS', 'KHR', 'KMF', 'KPW', 'KRW', 'KWD', 'KYD', 'KZT', 'LAK', 'LBP',
    'LKR', 'LRD', 'LSL', 'LYD', 'MAD', 'MDL', 'MGA', 'MKD', 'MMK', 'MNT', 'MOP',
    'MRU', 'MUR', 'MVR', 'MWK', 'MXN', 'MXV', 'MYR', 'MZN', 'NAD', 'NGN', 'NIO',
    'NOK', 'NPR', 'NZD', 'OMR', 'PAB', 'PEN', 'PGK', 'PHP', 'PKR', 'PLN', 'PYG',
    'QAR', 'RON', 'RSD', 'RUB', 'RWF', 'SAR', 'SBD', 'SCR', 'SDG', 'SEK', 'SGD',
    'SHP', 'SLL', 'SOS', 'SRD', 'SSP', 'STN', 'SVC', 'SYP', 'SZL', 'THB', 'TJS',
    'TMT', 'TND', 'TOP', 'TRY', 'TTD', 'TWD', 'TZS', 'UAH', 'UGX', 'USD', 'USN',
    'UYI', 'UYU', 'UYW', 'UZS', 'VES', 'VND', 'VUV', 'WST', 'XAF', 'XAG', 'XAU',
    'XBA', 'XBB', 'XBC', 'XBD', 'XCD', 'XDR', 'XOF', 'XPD', 'XPF', 'XPT', 'XSU',
    'XTS', 'XUA', 'YER', 'ZAR', 'ZMW', 'ZWL'
]

currency_dict = dict({' ': currency_codes})
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_dict(currency_dict)
cleaned_text = keyword_processor.replace_keywords(cleaned_text)

# remove whitespace
cleaned_text = ' '.join(cleaned_text.split())
print('step 6')

sentences = sent_tokenize(cleaned_text)
sentences = list(filter(None, sentences))

output_file_name = input("Please enter the output file name : ")
with open(output_file_name, "w") as f:
    for sentence in sentences:
        f.write(sentence + '\n')
#------------------------------- TOKENIZATION--------------------------------------
from nltk.tokenize import sent_tokenize, word_tokenize

text = "Twitter was created in March 2006 by Jack Dorsey," + \
       "Noah Glass, Biz Stone, and Evan Williams and launched " + \
       "in July of that year. The service rapidly gained worldwide " + \
       "popularity. In 2012, more than 100 million users posted 340 " + \
       "million tweets a day, and the service handled an average of " + \
       "1.6 billion search queries per day."

print "------------------------------- TOKENIZATION--------------------------------------"
print "\nSENTENCE TOKENIZATION: ", sent_tokenize(text)
print "\nWORD TOKENIZATION: ", word_tokenize(text)

#-------------------------STOP WORD REMOVAL--------------------------------------
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
stop_words.add('.')
stop_words.add(',')
word_tokens = word_tokenize(text)
filtered_sentence = [w for w in word_tokens if not w in stop_words]
filtered_sentence = []
for w in word_tokens:
    if w not in stop_words:
import string
from pprint import pprint
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.chunk import ne_chunk
from nltk.draw.tree import TreeView

text = '''Another ex-Golden Stater, Paul Stankowski from Oxnard, is contending
for a berth on the U.S. Ryder Cup team after winning his first PGA Tour event
last year and staying within three strokes of the lead through three rounds of
last month's U.S. Open. H.J. Heinz Company said it completed the sale of its
Ore-Ida frozen-food business catering to the service industry to McCain Foods
Ltd. for about $500 million. It's the first group action of its kind in Britain.'''
print(text)

# Sentence splitting
nltk_sentence_splitted = sent_tokenize(text)
for index, sentence in enumerate(nltk_sentence_splitted, 1):
    print(f'SENTENCE {index}: {sentence}')

# Tokenization
example_sentence = "I'll refuse to permit you to obtain the refuse permit."
tokenized = nltk.word_tokenize(example_sentence)
print(tokenized)

# Part of speech tagging
pos_tagged = nltk.pos_tag(tokenized)
print(pos_tagged)

# Remove stop words
english_stopwords = stopwords.words('english')
set_english_stopwords = set(english_stopwords)  # sets are faster to check if an element is in
    return list(set(all_pairs))

# # testing
# corpus = "a b. a b c. a b c d. b c. b c d. c d. d c. d c b. d c b a. a b c d e. e d. e d c."

unique_words = get_unique_words(corpus)
n = len(unique_words)
w2id, id2w = w2id_id2w_maps(unique_words)

# create an empty cooccurrence matrix
A = np.zeros([n, n], np.float32)

# compute the cooccurrence matrix
sentences = sent_tokenize(corpus)
for s in sentences:
    s = process_text(s)
    max_distance = len(s) + 1
    s = [w2id[w] for w in s]  # convert words to ids
    for d in range(2, max_distance):
        pairs = cooccurence_pair_of_distance(s, d)
        # update the cooccurrence matrix for each pair
        for p in pairs:
            A[p[0], p[1]] += ngram_inc_amt(d)
            A[p[1], p[0]] += ngram_inc_amt(d)

# finished cooccurrence matrix A
print(w2id)
def get_documents_bbc(tree): documents = [] prev_hour = ['00:00'] articles = tree.xpath(".//article") for article in articles: # source title source_title = article.xpath( './/header[@class="lx-stream-post__header gs-o-media"]') if len(source_title) == 1: source_title = text_normalization( BeautifulSoup(html.tostring(source_title[0]), "html.parser").get_text()) # hour hour = re.findall( r"[0-9]{2}:[0-9]{2}", html.tostring(article)) # get the hour linked to the article if not hour: hour = prev_hour # lines lines = article.xpath('.//div[@class="lx-stream-post-body"]//p') # text text_lines = [] if len(lines) >= 1: for line in lines: text_lines.append( BeautifulSoup(html.tostring(line), "html.parser").get_text()) # author author = article.xpath( './/div[@class="lx-stream-post__contributor gs-o-media"]') if len(author) == 1: author = author[0].xpath( ".//p/text()" ) # get the description of the author of the article else: author = '' # extract the links form the block lines = article.xpath('.//div[@class="lx-stream-post-body"]') if len(lines) == 1: cont = html.tostring(lines[0]) links = set( re.findall(r'https?://[a-z\.]+/[a-z\-_0-9/]+\.[a-z]{2,4}', cont)) links = links.union( re.findall(r'https?://[A-Za-z\.]+/[A-Za-z\-_0-9/]+', cont)) else: cont = html.tostring(article) links = set( re.findall(r'https?://[a-z\.]+/[a-z\-_0-9/]+\.[a-z]{2,4}', cont)) links.union( re.findall(r'https?://[A-Za-z\.]+/[A-Za-z\-_0-9/]+', cont)) try: for link in links: # print link if "https://twitter.com/" in link and "status" in link: # we extract the content from the twitter status twi_page = get(link).text twi_tree = html.fromstring(twi_page) tweets = twi_tree.xpath( '//p[contains(@class, "tweet-text")]') if len(tweets) >= 1: for tweet in tweets: twi_text = BeautifulSoup(html.tostring(tweet), "html.parser").get_text() text_lines.append(twi_text) except: pass # retrieving of links in the text block_id = article.get("id") block_text = [ sent_tokenize(text_normalization(line.strip())) for line in text_lines if line.strip() != u"" ] block_text = list(itertools.chain.from_iterable(block_text)) if len(block_text) == 1: if block_text[0] == '': block_text = [source_title] if len(block_text) == 0: block_text = [source_title] d_block = { "time": hour[0], "text": block_text, "block_id": block_id, "author": author, "title": source_title } prev_hour = hour documents.append(d_block) return documents
def geigerize():
    """
    Selects highlights from submitted comments using the specified strategy.
    """
    data = request.get_json()
    strat = data['strategy']

    # Wrangle posted comments into the minimal format needed for processing.
    comments = [Comment({
        'commentID': c['id'],
        'commentBody': c['body_html'],
        'recommendations': c['score'],
        'userDisplayName': c['author'],
        'createDate': 0,
        'replies': []  # ignoring replies for now
    }) for c in data['comments']]

    results = []
    if config.sentences:
        # Try out sentences as the object
        sentences = [[Sentence(sent, c) for sent in sent_tokenize(c.body)] for c in comments]
        sentences = [s for sents in sentences for s in sents]

        # Run the specified strategy.
        raw_results = getattr(geiger, strat)(sentences)

        # Format results into something jsonify-able.
        for r in raw_results:
            s = r[1]
            results.append({
                'sentence': r[0],
                'comment': {
                    'id': s.comment.id,
                    'body': s.body,
                    'author': s.comment.author
                },
                'support': int(r[2]),
                'cohort': [c.body for c in r[3]]
            })
    else:
        raw_results = getattr(geiger, strat)(comments)

        # Format results into something jsonify-able.
        for r in raw_results:
            comment = r[1]
            results.append({
                'sentence': r[0],
                'comment': {
                    'id': comment.id,
                    'body': comment.body,
                    'author': comment.author
                },
                'support': int(r[2]),
                'cohort': [c.body for c in r[3]]
            })
    return jsonify(results=results)
from nltk import ne_chunk
from collections import Counter
from nltk.util import ngrams
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# nltk.download('punkt')
# nltk.download('word_tokenize')
# nltk.download('wordpunct_tokenize')
# nltk.download('sent_tokenize')

s = "Hi this is a python program. I am doing the in class exercise. Time given to solve is 300 minutes."

meaning = wn.synsets('program')
for a in meaning:
    print(a.definition())
print([str(syns.definition) for syns in meaning])

x = sent_tokenize(s)
print x
for t in x:
    print word_tokenize(t)

# lemmatization
print "lemmatization:"
lemmatizer = WordNetLemmatizer()
print lemmatizer.lemmatize('cooking')
print lemmatizer.lemmatize('cooking', pos='v')

# stemming
stemmer = PorterStemmer()
print "stemming"
print stemmer.stem('cooking')
# nltk.download('all')
def predict(pathToModels, pageParsedContent): import numpy as np import pandas as pd import requests import matplotlib.pyplot as plt from bs4 import BeautifulSoup data = pd.read_csv(pathToModels + '/Tech/odf_scraped.csv') X = data.iloc[:, 0] y = data.iloc[:, 1] from sklearn.feature_extraction.text import CountVectorizer cv = CountVectorizer(max_features=200) X_vec = cv.fit_transform(X) from sklearn.feature_extraction.text import TfidfTransformer tf = TfidfTransformer() X_tf = tf.fit_transform(X_vec) from sklearn.linear_model import LogisticRegression lr = LogisticRegression() lr.fit(X_tf, y) print('done successfuly till here') soup = pageParsedContent main_title = soup.find('title') main_title = main_title.get_text() main_title = main_title.replace(",", " ") with open(pathToModels + '/Tech/final_testing.txt', 'w') as f: f.write(main_title + '\n') f.close() with open(pathToModels + '/Tech/final_testing.txt', 'r') as f: text_test = f.read() f.close() text_test = [text_test] text_test_cv = cv.transform(text_test) text_test_tf = tf.transform(text_test_cv) u = lr.predict(text_test_tf) print(u) if (u == 1): return u #output = round(prediction[0], 2) import nltk from nltk.tokenize import word_tokenize from nltk.corpus import stopwords sw = set(stopwords.words('english')) num = [] for i in range(0, 3000): i = str(i) num.append(i) mx = [ '(', ')', ',', '.', '?', '#', '@', '!', '[', ']', '<', '>', '/', ' ', '|', "''", '...', ':' ] soup = pageParsedContent body_content = soup.body for script in body_content(["script", "style"]): script.decompose() body_content_text = body_content.get_text() body_content_text = body_content_text.replace('\n', ' ') body_content_text = body_content_text.replace('\t', ' ') body_content_text = body_content_text.replace("'", ' ') with open(pathToModels + '/Tech/final_scraped_content_test.txt', 'w') as f: f.write(body_content_text) f.close() with open(pathToModels + '/Tech/final_scraped_content_test.txt', 'r') as f: new_text = f.read() f.close() from nltk.tokenize import sent_tokenize sent = sent_tokenize(new_text) data = [] for i in sent: words = word_tokenize(i) for w in words: if w not in sw and w not in mx and w not in num: data.append(w) from collections import Counter count = Counter(data) count = sorted(count.items(), key=lambda x: x[1], reverse=True) tech = [ 'Samsung', 'Xiaomi', 'Lenovo', 'Vivo', 'Oppo', 'Apple', 'LG', 'Nokia', 'OnePlus', 'HTC', 'Huawei', 'Amazon', 'Flipkart', 'Google', 'Mobile', 'Smartphones', 'Smartphone', 'Laptop', 'AI', 'Robots', 'Gaming', 'Headphone', 'PC', 'Computers', 'Apps', 'App', 'Phones', 'Phone', 'Smart', 'Android', 'iOS', 'Snapdragon', 'Qualcom', 'Intel', 'Startups', 'Startups', 'smartwatch', 'Realme', 'Redmi', 'Bluetooth', 'Camera' ] cc = 0 ff = 0 for i in range(0, len(count) - 1): de = count[i][1] if (de == 1): ff = ff + 1 else: cc = cc + de score = 0 b = 0 t = 0 for i in range(0, len(count) - 1): a = count[i][0] b = count[i][1] for k in tech: if (a == k): print(a) b = b / cc t = t + b print(b) score = score + 1 fp = count[0][1] xy = fp / cc if (b > xy and score > 7): return 1 else: return 0
# use nltk to separate the sentences
from nltk.tokenize import sent_tokenize

parag = "My name is roky . i like nltk . i like python"
print(sent_tokenize(parag))
myArr = sent_tokenize(parag)
print(myArr)
print(myArr[2])
print(word_tokenize(data))

# <a id="551"></a> <br>
# All of them are words except the comma. Special characters are treated as separate tokens.
#
# ## 5-5-1 Tokenizing sentences
# The same principle can be applied to sentences. Simply change the word_tokenize() call to sent_tokenize().
# We have added two sentences to the variable data:

# In[ ]:

from nltk.tokenize import sent_tokenize, word_tokenize

data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
print(sent_tokenize(data))

# <a id="552"></a> <br>
# ## 5-5-2 NLTK and arrays
# If you wish, you can store the words and sentences in arrays:

# In[ ]:

from nltk.tokenize import sent_tokenize, word_tokenize

data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
phrases = sent_tokenize(data)
words = word_tokenize(data)
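# Printing the two arrays built above shows the difference between sentence-level
# and word-level tokenization (assuming the punkt data is installed):
print(phrases)  # ['All work and no play makes jack dull boy.', 'All work and no play makes jack a dull boy.']
print(words)    # ['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'dull', 'boy', '.', ...]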
    ":) and :D",  # emoticons handled
    "",  # an empty string is correctly handled
    "Today sux",  # negative slang handled
    "Today sux!",  # negative slang with punctuation emphasis handled
    "Today SUX!",  # negative slang with capitalization emphasis
    "Today kinda sux! But I'll get by, lol"  # mixed sentiment example with slang and contrastive conjunction "but"
]

paragraph = "It was one of the worst movies I've seen, despite good reviews. \
Unbelievably bad acting!! Poor direction. VERY poor production. \
The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"

from nltk import tokenize
lines_list = tokenize.sent_tokenize(paragraph)
sentences.extend(lines_list)

tricky_sentences = [
    "Most automated sentiment analysis tools are shit.",
    "VADER sentiment analysis is the shit.",
    "Sentiment analysis has never been good.",
    "Sentiment analysis with VADER has never been this good.",
    "Warren Beatty has never been so entertaining.",
    "I won't say that the movie is astounding and I wouldn't claim that \
the movie is too banal either.",
    "I like to hate Michael Bay films, but I couldn't fault this one",
    "It's one thing to watch an Uwe Boll film, but another thing entirely \
to pay for it",
    "The movie was too good",
    "This movie was actually neither that funny, nor super witty.",
print("*****************Data with no stop words*****************") print(text_NoStopWords) # Deleting puntuation marks from the text.file punct = set(string.punctuation) text_NoPunct = ''.join(x for x in text_NoStopWords if x not in punct) print("*******************Data without punctuation******************") print(text_NoPunct) # Step 3 & 4: Removing verbs from the text by applying word tokenizing and POS words = word_tokenize(text_NoPunct) tokes_pos = nltk.pos_tag(words) for i in tokes_pos: if 'VB' not in i[1]: words_NoVerbs.append(i[0]) print("*****************Data with no verbs************") print(words_NoVerbs) # Step 5 & 6: Fetching the top 5 most occurring words counts = Counter(words_NoVerbs).most_common(5) print("************Most repeated words in the tetx file************") print(counts) # Steps 7 to 10: Concatenating and printing the statements containing the most frequent words for top in counts: for sent in sent_tokenize(data.lower()): if sent not in text_res: if top[0] in word_tokenize(sent): text_res = text_res + sent print("**************Final text************") print(text_res)
def annotate(self, essay_text): try: sentences = sent_tokenize(essay_text.strip()) contents = "\n".join(sentences) essay = Essay(full_path=None, include_vague=self.config["include_vague"], include_normal=self.config["include_normal"], load_annotations=False, essay_text=contents) processed_essays = process_essays( essays=[essay], spelling_corrector=self.spelling_corrector, wd_sent_freq=self.wd_sent_freq, remove_infrequent=self.config["remove_infrequent"], spelling_correct=self.config["spelling_correct"], replace_nums=self.config["replace_nums"], stem=self.config["stem"], remove_stop_words=self.config["remove_stop_words"], remove_punctuation=self.config["remove_punctuation"], lower_case=self.config["lower_case"]) self.logger.info("Essay loaded successfully") essays_TD = self.feature_extractor.transform(processed_essays) wd_feats, _ = flatten_to_wordlevel_feat_tags(essays_TD) xs = self.feature_transformer.transform(wd_feats) wd_predictions_by_code = test_classifier_per_code( xs, self.tag_2_wd_classifier, self.wd_test_tags) dummy_wd_td_ys_bytag = defaultdict( lambda: np.asarray([0.0] * xs.shape[0])) sent_xs, sent_ys_bycode = get_sent_feature_for_stacking_from_tagging_model( self.sent_input_feat_tags, self.sent_input_interaction_tags, essays_TD, xs, dummy_wd_td_ys_bytag, self.tag_2_wd_classifier, sparse=True, look_back=0) """ Test Stack Classifier """ sent_predictions_by_code = test_classifier_per_code( sent_xs, self.tag_2_sent_classifier, self.sent_output_train_test_tags) """ Generate Return Values """ essay_tags = self.__get_essay_tags_(sent_predictions_by_code) essay_type = None if "coral" in self.essays_folder.lower(): essay_type = "CB" elif "skin" in self.essays_folder.lower(): essay_type = "SC" else: raise Exception("Unknown essay type") raw_essay_tags = ",".join(sorted(essay_tags, key=cr_sort_key)) t_words = self.__get_tagged_words_(essay, essays_TD[0], wd_predictions_by_code) t_sentences = self.__get_tagged_sentences_( essay, sent_predictions_by_code) tagged_sentences = [ t_sent.add_word_tags(map(lambda twd: twd.__dict__, t_wds)).__dict__ for t_sent, t_wds in zip(t_sentences, t_words) ] essay_codes, essay_causal = self.__format_essay_tags_(essay_tags) return { "tagged_sentences": tagged_sentences, "essay_codes": essay_codes, "essay_causal": essay_causal, "essay_category": essay_category(raw_essay_tags, essay_type), "raw_essay_tags": raw_essay_tags } except Exception as x: self.logger.exception( "An exception occured while annotating essay") return {"error": format_exc()} pass
def summarize(text, target_sentences=3):
    """
    Given all the text in a page, determine a number of summarizing sentences.
    """
    def page_rank(G, s=.85, maxerr=.001):
        G = np.array(G)
        n = G.shape[0]
        # transform G into markov matrix M
        M = csc_matrix(G, dtype=np.float)
        rsums = np.array(M.sum(1))[:, 0]
        ri, ci = M.nonzero()
        M.data /= rsums[ri]
        # bool array of sink states
        sink = rsums == 0
        # Compute pagerank r until we converge
        ro, r = np.zeros(n), np.ones(n)
        while np.sum(np.abs(r - ro)) > maxerr:
            ro = r.copy()
            # calculate each pagerank at a time
            for i in xrange(0, n):
                # inlinks of state i
                Ii = np.array(M[:, i].todense())[:, 0]
                # account for sink states
                Si = sink / float(n)
                # account for teleportation to state i
                Ti = np.ones(n) / float(n)
                r[i] = ro.dot(Ii * s + Si * s + Ti * (1 - s))
        # return normalized pagerank
        return r / sum(r)

    def vec_tfidf(sent):
        words = map(lambda s: s.lower(), nltk.word_tokenize(sent))
        counts = {}
        for w in words:
            if w not in counts:
                counts[w] = 0
            counts[w] += 1
        max_count = max(counts.values())
        tfidf = {}
        for word in counts.keys():
            tfidf[word] = (0.5 + 0.5 * counts[word] / max_count)
            tfidf[word] *= math.log(1 + brown_sent_count / total_word_counts.get(word, 1))
        return tfidf

    def cos_similarity(vec1, vec2):
        num = 0
        denom = sum(vec1.values()) * sum(vec2.values())
        for e in vec1.keys():
            if e in vec2:
                num += vec1[e] * vec2[e]
        return 1.0 * num / denom

    tfidf = []
    sentences = tokenize.sent_tokenize(text)
    for sent in sentences:
        tfidf.append(vec_tfidf(sent))
    matrix = []
    for v1 in tfidf:
        row = []
        for v2 in tfidf:
            row.append(cos_similarity(v1, v2))
        matrix.append(row)
    scores = page_rank(matrix)
    return [sentences[i] for i in sorted(range(len(scores)), key=lambda x: -scores[x])][:target_sentences]
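# Usage sketch (assumes the module-level brown_sent_count / total_word_counts
# statistics used by vec_tfidf are already built, and Python 2, which the
# xrange usage above implies; the file name is made up):
# page_text = open('article.txt').read()
# for s in summarize(page_text, target_sentences=3):
#     print s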
"dropout_keep_prob").outputs[0] # Tensors we want to evaluate predictions = graph.get_operation_by_name( "output/predictions").outputs[0] # Generate batches for one epoch batches = dh.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False) all_predictions = [] for x_test_batch in batches: batch_predictions = sess.run(predictions, { input_x: x_test_batch, dropout_keep_prob: 1.0 }) all_predictions = np.concatenate( [all_predictions, batch_predictions]) ret = dict() ret['results'] = list() pred_sentence = sent_tokenize(inputEssay) for i in range(len(all_predictions)): if (all_predictions[i] == 1): ret['results'].append(pred_sentence[i]) print(json.dumps(ret)) # print(inputEssay)
from nltk.tokenize import sent_tokenize, word_tokenize

example = "Hello My name is Muhammad Aashir And i lives in multan, pakistan "
ab = sent_tokenize(example)
print('This is Sent Tokenizer = ', ab)