def __init__(self):
    with open(TREETAGGER_ABBREVIATIONLIST, mode='r', encoding='utf-8') as f:
        abbr = set([l.strip('.\n') for l in f.readlines()])
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = abbr
    self.tokenizer = PunktSentenceTokenizer(punkt_param)
def parseTextToSentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
def summarize(self):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(self.text)
    structure = {}
    sentence_objects = []
    for idx in range(len(sentences)):
        obj = {'text': sentences[idx], 'index': idx, 'data': {}}
        sentence_objects.append(obj)
    structure['sentences'] = sentence_objects
    self.sentencecount = len(structure['sentences'])
    structure['ordered'] = []
    structure['weights'] = {'words': FreqDist(nltk.word_tokenize(preprocess(self.text))),
                            'total': 0, 'transformed': 0}
    structure['weights']['total'] = sum(structure['weights']['words'].values())
    self.sentenceIndex = 0
    for each_sent in structure['sentences']:
        each_sent['data']['tokens'] = nltk.word_tokenize(preprocess(each_sent['text']))
        each_sent['data']['sinTransform'] = (1 - math.sin(self.sentenceIndex * (math.pi / self.sentencecount))) + 1
        for each_word in structure['weights']['words']:
            if each_word in each_sent['data']['tokens']:
                structure['weights']['words'][each_word] *= each_sent['data']['sinTransform']
        self.sentenceIndex += 1
    structure['weights']['transformed'] = sum(structure['weights']['words'].values())
    self.sentenceIndex = 0
    for each_sent in structure['sentences']:
        each_sent['data']['weights'] = {
            'words': self.calculate_relative_frequence(each_sent['data']['tokens'],
                                                       structure['weights']['words']),
            'total': 0}
        each_sent['data']['weights']['total'] = sum(each_sent['data']['weights']['words'].values())
        self.sentenceIndex += 1
    structure['ordered'] = sorted(structure['sentences'],
                                  key=lambda x: x['data']['weights']['total'], reverse=True)
    structure_keep = structure['ordered'][:self.quota]
    structure_keep.sort(key=lambda x: x['index'])
    for eac_sen in structure_keep:
        self.summary.append(eac_sen['text'])
def tokenise(self, sample):
    # first pass - look for poems
    verses = self.scan_for_verse(sample)
    if verses:
        self.notes.append("got {} verses".format(len(verses)))
        verses = [re.sub(r'\[\d+\]', '', v) for v in verses]
    else:
        verses = []
    # second pass - look for sentences
    text = re.sub(r'\[\d+\]', '', sample)
    text = re.sub("\r\n", ' ', text)
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(self.cf['abbreviations'])
    tokenizer = PunktSentenceTokenizer(punkt_param)
    sentences = tokenizer.tokenize(text)
    sentences = sentences[1:-1]
    self.notes.append("got {} sentences".format(len(sentences)))
    # remove any sentences which we already found as part of verses
    for s in sentences:
        matches = [v for v in verses if s[:SENTENCE_MATCH] in v]
        if matches:
            self.notes.append("found sentence {} in verses {}".format(s, matches))
            sentences.remove(s)
    verses.extend(sentences)
    return verses
def fractal_representation(self):
    punkt_param = PunktParameters()
    for each_paragraph in self.paragraphs:
        buffer_p = paragraph()
        buffer_p.paragraph = each_paragraph
        buffer_p.tokens = nltk.word_tokenize(preprocess(each_paragraph))
        buffer_p.weights['words'] = FreqDist(buffer_p.tokens)
        buffer_p.weights['total'] = {'words': 0, 'sentences': 0}
        punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
        sentence_splitter = PunktSentenceTokenizer(punkt_param)
        sentences = sentence_splitter.tokenize(each_paragraph)
        for each_sentence in sentences:
            self.stotal += 1
            buffer_s = sentence()
            buffer_s.sentence = each_sentence
            buffer_s.tokens = nltk.word_tokenize(preprocess(each_sentence))
            if len(buffer_s.tokens) > 0:
                buffer_s.weights['sentence'] = FreqDist(buffer_s.tokens)
                buffer_s.weights['paragraph'] = self.calculate_relative_frequence(buffer_s.tokens, buffer_p.weights['words'])
                buffer_s.weights['document'] = self.calculate_relative_frequence(buffer_s.tokens, self.fractal.weights)
                buffer_s.weights['total'] = {}
                buffer_s.weights['total']['sentence'] = 1
                buffer_s.weights['total']['paragraph'] = sum(buffer_s.weights['paragraph'].values())
                buffer_s.weights['total']['document'] = sum(buffer_s.weights['document'].values())
                self.s_weight += buffer_s.weights['total']['document']
                buffer_p.weights['total']['sentences'] += buffer_s.weights['total']['document']
            buffer_p.sentences.append(buffer_s)
        buffer_p.weights['total']['words'] = sum(buffer_p.weights['words'].values())
        self.fractal.paragraphs.append(buffer_p)
        self.pindex += 1
def getSentences(text):
    # returns a list of sentences tokenized by Punkt
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text)
    return sentences
def retrieveUrlText(url):
    try:
        config = Config()
        config.request_timeout = 1000
        config.memoize_articles = False
        config.fetch_images = False
        config.browser_user_agent = 'Mozilla/5.0'
        article = Article(url, config)
        article.download(recursion_counter=5)
        if article.download_state != 2:
            return ''
        article.parse()
        articleText = article.text.replace('\n', ' ')
    except KeyboardInterrupt:
        raise
    except Exception:
        return ''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'fig', 'figs', 'chem', 'ph'
    ])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    articleSentences = validateSentences(sentence_splitter.tokenize(articleText))
    return articleSentences
def _split_sentences(self, text):
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text)
    return sentences
def __getlemmas(self, txt):
    '''
    Filters nouns, adjectives and verbs from the input text, lemmatizes them
    and returns them as a list of words (tokens).

    Parameters:
        @txt : the text (str) to be lemmatized
    '''
    lemma = WordNetLemmatizer()
    punkts = PunktParameters()
    punkts.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sent_tokenizer = PunktSentenceTokenizer(punkts)
    sentences = sent_tokenizer.tokenize(txt)
    lemma_tokens = []
    for sentence in sentences:
        stoken = word_tokenize(sentence)
        pos_sent = pos_tag(stoken)
        for p in pos_sent:
            if p[1].startswith('N'):
                pos = wordnet.NOUN
            elif p[1].startswith('J'):
                pos = wordnet.ADJ
            elif p[1].startswith('V'):
                pos = wordnet.VERB
            else:
                pos = None
            if pos:
                lemma_tokens.append(lemma.lemmatize(p[0].lower(), pos))
    return lemma_tokens
def read_docx(path):
    """read .docx (Microsoft 2007+)"""
    try:
        doc = docx.Document(path)
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set(['fig'])
        tokenizer = PunktSentenceTokenizer(punkt_param)
        body = []
        for p in doc.paragraphs:
            body += tokenizer.tokenize(clean_text(p.text))
        body = '\n'.join(body)
        tables = []
        for t in doc.tables:
            table = {'cells': []}
            for row in t.rows:
                row_elements = []
                for cell in row.cells:
                    for p in cell.paragraphs:
                        row_elements.append({'text': clean_text(p.text)})
                table['cells'].append(row_elements)
            tables.append(table)
        data = PaperData(body, tables)
    except Exception:
        logger.info('fail: %s', path)
        traceback.print_exc()
        return PaperData()
    return data
def process_doc(self, xmlfile):
    # Set up sentence tokenizer
    punkt_param = PunktParameters()
    # Domain specific abbreviations
    punkt_param.abbrev_types = set(["e.g", "al", "i.e"])
    sent_tokenize = PunktSentenceTokenizer(punkt_param).tokenize
    tree = etree.parse(xmlfile)
    algrthms = tree.getroot()
    block = algrthms.iterdescendants(["sectionHeader", "bodyText"])
    section = ""
    counter = 0
    sentences = []
    try:
        while True:
            blk = block.next()
            if blk.tag == "sectionHeader":
                section = blk.get("genericHeader")
                # check if the next blk is bodyText; it might be a section
                sentences = sent_tokenize(remove_crlf(block.next().text))
                self.update_section(section, OrderedDict(enumerate(sentences, start=counter)))
            else:
                sentences = sent_tokenize(remove_crlf(blk.text))
                self.update_section(section, OrderedDict(enumerate(sentences, start=counter)))
            counter += len(sentences)
    except StopIteration:
        pass
    except Exception as e:
        logit("Something went wrong while processing the document!")
        logit(section)
        logit(str(e))
def clean(text):
    # Returns cleaned, tokenized documents from raw HTML text.
    text = cleanmyhtml(text)
    # We need to remove things like (R-NE). There are some wacky abbreviations
    # for states, but all fall under five characters.
    text = re.sub(r'\w{1}\-\w{1,5}\.', '', text)
    # U.S. needs to become US or else it'll tokenize weirdly. Same with
    # H.R. (house resolution).
    text = re.sub(r'U\.S\.', 'US', text)
    text = re.sub(r'H\.R\.', 'HR', text)
    # NLTK is pretty poor at tokenizing sentences that contain ." or .'
    # We'll insert a space into these.
    text = re.sub(r'\.\"', '. \"', text)
    text = re.sub(r'\.\'', '. \'', text)
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'reps', 'Reps', 'H.R', 'h.r', 'hr', 'HR', 'vs', 'mr', 'ms',
        'pres', 'mrs', 'prof', 'inc', 'sens', 'Sens', 'Sen', 'sen'
    ])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text)
    return sentences
def summarize(text, ref='', lines=7):
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    clean_text = text.lower()
    clean_text = re.sub(r'\W', ' ', clean_text)
    clean_text = re.sub(r'\d', ' ', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    text = text.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = sentence_splitter.tokenize(text)
    # sentences = nltk.sent_tokenize(text)
    stop_words = nltk.corpus.stopwords.words('english')
    word_count = {}
    for word in nltk.word_tokenize(clean_text):
        if word not in stop_words:
            word_count[word] = word_count.get(word, 0) + 1
    sentence_score = {}
    i = 0
    for s in sentences:
        for word in nltk.word_tokenize(s.lower()):
            if word in word_count.keys():
                old = sentence_score.get(s, (0, 0, i))
                i += 1
                sentence_score[s] = (old[0] + word_count[word], old[1] + 1, old[2])

    def score(pair):
        return (pair[0] - pair[2]) / pair[1]

    scores = {}
    for s in sentence_score.keys():
        if sentence_score[s][1] > 2:
            scores[s] = score(sentence_score[s])
        else:
            scores[s] = score(sentence_score[s]) - 100
    best_sentences = heapq.nlargest(lines, scores, key=scores.get)
    best_sentences.sort(key=lambda x: sentence_score[x][2])
    string = ''
    for s in best_sentences:
        if s[0] == ' ':
            s = s[1:]
        if 'refer' in s and len(scores.keys()) < 4:
            print('Please be more specific\n')
            if len(ref) > 1:
                print('Here are some suggestions:')
                for i in range(len(ref)):
                    print("=>", ref[i])
                print('\n')
            return
        print(s)
        string += s + '\n'
    return string
def _punkt_sent_tokenize(text):
    '''
    Sentence segmentation using nltk PunktSentenceTokenizer.
    '''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(config.tokenize_abbrev)
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
def _split_text_to_sentences(self, text):
    # splits text to sentences (uses some utilities from nltk)
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text)
    return sentences
def sentenceToken(self, text):
    """Split review text into a list of sentences.

    text: the review string to split.
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs'])
    tokenizer = PunktSentenceTokenizer(punkt_param)
    return tokenizer.tokenize(text)
def parse(text):
    """Use nltk's PunktSentenceTokenizer to convert the text string
    into a list of English-language sentences."""
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(ABBREVIATIONS)
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(preprocess(text))
def tokenize_to_sentences2(doc):
    punkt_param = PunktParameters()
    abbreviations = [
        "u.s.a", "fig", "gov", "sen", "jus", "jdg", "rep", "pres", "mr",
        "mrs", "ms", "h.r", "s.", "h.b", "s.b", "u.k", "u.n", "u.s.s.r",
    ]
    punkt_param.abbrev_types = set(abbreviations)
    tokenizer = PST(punkt_param)
    return tokenizer.tokenize(doc)
def __init__(self, abbrev=['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e']):
    """Initialize Textualizer.

    Usually, you need to create only one textualizer in your script.

    Args:
        abbrev (list): List of abbreviations
    """
    punkt = PunktParameters()
    punkt.abbrev_types = set(abbrev)
    self.tokenizer = PunktSentenceTokenizer(punkt)
def _nltk_tokenizer(self, document):
    abbreviation = ['sra', 'dª', 'dña', 'sras', 'sres', 'sr', 'excmos', 'excmo',
                    'excma', 'excmas', 'ilma', 'ilmas', 'ilmo', 'ilmos', 'ilma',
                    'ilmas', 'art', 'arts', 'núm', 'cp', 'c.p', 's.l', 'rcud',
                    'rcuds', 'rec']
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(abbreviation)
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    text = document
    sentences = sentence_splitter.tokenize(text)
    return sentences
def rank(self, doc="doc.txt", out="doc_textrank.txt", top=10, stop_word=False, stem=False): with open(doc, "r") as lofile: document = lofile.read() # == refine document for process == document = document.replace('\n', ' ')\ .replace('."', '. "').replace('?"', '? "').replace('!"', '! "')\ .replace('.”', '. ”').replace('?”', '? ”').replace('!”', '! ”')\ .decode('utf-8') # document = ' '.join(document.strip().split('\n')) # == sentences tokenize == punkt_param = PunktParameters() punkt_param.abbrev_types = set( ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc']) sentence_tokenizer = PunktSentenceTokenizer(punkt_param) sentences = sentence_tokenizer.tokenize(document) # FEATURE: stem words, remove stop words # == count words for each sentence == wordCounter = CountVectorizer() # approach 0: non if stop_word: wordCounter = CountVectorizer( stop_words='english') # approach 1: only stop_word if stem: wordCounter = CountVectorizer( stop_words='english', preprocessor=stemSen) # approach 2: stop_word & stem count_matrix = wordCounter.fit_transform(sentences) normalized_matrix = TfidfTransformer().fit_transform(count_matrix) # wordCounter = TfidfVectorizer() # normalized_matrix = wordCounter.fit_transform(sentences) # == similarity among sentences == similarity_graph = normalized_matrix * normalized_matrix.T nx_graph = nx.from_scipy_sparse_matrix(similarity_graph) scores = nx.pagerank(nx_graph) orderedSentences = sorted( ((scores[i], s) for i, s in enumerate(sentences)), reverse=True) if len(orderedSentences) < top: top = len(orderedSentences) with open(out, "w") as lofile: for i in range(0, top): lofile.write(orderedSentences[i][1].encode('ascii', 'ignore')) lofile.write("\n")
def nltk_get_tokenizer():
    """
    Return a tokenizer with some customization for Hansard

    :return: a Punkt tokenizer
    """
    # With thanks to
    # https://stackoverflow.com/questions/34805790/how-to-avoid-nltks-sentence-tokenizer-spliting-on-abbreviations
    punkt_param = PunktParameters()
    # 'hon. Gentleman' is very common in Hansard!
    abbreviation = ['hon', 'mr', 'mrs', 'no']
    punkt_param.abbrev_types = set(abbreviation)
    return PunktSentenceTokenizer(punkt_param)
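# Hedged usage sketch for nltk_get_tokenizer() above; the quoted Hansard-style
# text is made up for illustration. With 'hon' registered as an abbreviation,
# the tokenizer should not break the sentence after "hon.".
hansard_tokenizer = nltk_get_tokenizer()
print(hansard_tokenizer.tokenize(
    "I thank the hon. Gentleman for his question. He makes a fair point."))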
def segmentPureText(self, txtfile):
    punkt_param = PunktParameters()
    abbreviation = [
        "U.S.A", "u.s.a", "figure", "fig", "Table", "table", "Eq", "eq",
        "equation", "et al", "e.g", "i.e", "Fig", "s.d", "etc", "i.v"
    ]
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)
    tokenized_output = tokenizer.tokenize(txtfile)
    # print(tokenized_output)
    return tokenized_output
def parse_text_to_sentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
def tokenize_sentences(text):
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters, PunktLanguageVars

    class CommaPoint(PunktLanguageVars):
        sent_end_chars = ('.', '?', '!')

    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'al', 'i.v'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param, lang_vars=CommaPoint())
    sentences = sentence_splitter.tokenize(text)
    return sentences
def keyword_sentiment():
    ## take in the input
    word = sys.argv[1]
    date_diff = int(sys.argv[2])
    ## create a sentence tokenizer
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc',
                                    '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                    '11', '12', '13', '14', '15', '16', '17', '18', '19', '20'])
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)
    ## calculate the barrier date
    DD = datetime.timedelta(days=date_diff)
    barrier_date = datetime.datetime.now() - DD
    ## make connection to db and fetch tweets (and respective sentiment) above the barrier_date
    db = MySQLdb.connect(host="localhost", user="******", passwd="{2qGq(22+5iU", db="Insights")
    cur = db.cursor()
    sql = "SELECT Phrase,Sentiment FROM Phrases WHERE `Date`>'" + str(barrier_date) + "';"
    cur.execute(sql)
    total_sentiment = 0
    total_count = 0
    ## locate tweets which contain the keyword, tokenize them into sentences
    for row in cur.fetchall():
        if row[0].lower().find(word.lower()) != -1:
            sentences = sent_tokenizer.tokenize(row[0])
            ## if a single sentence then just take the sentiment from db
            if len(sentences) == 1:
                total_sentiment = total_sentiment + float(row[1])
                total_count = total_count + 1
            ## else add together the sentiment of each sentence and keep the count
            else:
                for sentence in sentences:
                    blob = TextBlob(sentence)
                    total_sentiment = total_sentiment + int(blob.sentiment.polarity * 1000) / 1000.0
                    if sentence.lower().find(word.lower()) != -1:
                        total_count = total_count + 1
    ## json the total_sentiment/count and count
    if total_count != 0:
        json_array = [{"sentiment": int(total_sentiment / total_count * 1000) / 1000.0,
                       "count": total_count}]
    else:
        json_array = [{"sentiment": 0, "count": 0}]
    ## close the connection to the db
    db.close()
    ## print the json
    print(json.dumps(json_array))
def split_sentences(text):
    """Divides text into sentences. Returns a list of sentences.

    :param text:
    :return: list of sentences
    """
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    # TODO: Turkish abbreviations could be added here, abbreviations like "dr."
    # that Cuneyd hoca mentioned
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text)
    return sentences
def splitIntoSentences2(file_name):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    fp = open(file_name)
    data = fp.read()
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    # print '\n-----\n'.join(sentences)
    return sentences
def loadCorpus(self, path):
    for encoding in self.__encodings:
        try:
            self.__path = path
            fileName = codecs.open(self.__path, 'r', encoding=encoding)
            self.__rawText = fileName.read()
            break
        except UnicodeDecodeError:
            encoding = ''
            continue
    if encoding != '':
        self.initFields()
        # SENTENCES
        # more abbreviations with dots
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set(['dr', 'vs', 'n', 'v', 'etc', 'art', 'p', 'Cost', 'ss', 'pag'])
        sentence_splitter = PunktSentenceTokenizer(punkt_param)
        text = re.sub(ur'[\'\<\>`’]', ' ', self.__rawText)
        # text = re.sub('(\d+)', r' \1 ', text)
        sentences = sentence_splitter.tokenize(text)
        # TOKENS
        self.__tokens = [[token, ''] for token in
                         list(itertools.chain(*[customWordtokenize(sent) for sent in sentences]))]
        wordTokenizer = RegexpTokenizer('[a-zA-Z0-9\xe0\xe1\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa]+')
        # wordTokenizer = RegexpTokenizer('[\w]+')
        sentences = [wordTokenizer.tokenize(sent.lower()) for sent in sentences
                     if len(wordTokenizer.tokenize(sent)) > 0]
        words = list(itertools.chain(*sentences))
        self.__words = words
        self.__sentences = sentences
        self.__avgSentLength = round(np.mean([len(sent) for sent in sentences]), 3)
        self.__avgWordLength = round(np.mean([len(word) for word in words]), 3)
        self.__freqDist = FreqDist(words)
        self.__wordCount = len(words)
        self.__lexicalDiversity = round(len(self.__freqDist.items()) / float(len(words)), 5)
        ### resetting members
        self.__concordanceIndex = None
        self.__bigrams = None
    return encoding
def getSentences(paragraph):
    unicode_data = paragraph.decode("utf-8")
    data = "".join([i if ord(i) < 128 else "" for i in unicode_data])
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    punkt_params = PunktParameters()
    punkt_params.abbrev_types = set(['al', "inc", "mr", "dr", "mrs", "prof"])
    splitter = PunktSentenceTokenizer(punkt_params)
    sentences = splitter.tokenize(data)
    sentences1 = filter_list(sentences)
    ## print sentences1, "\n-----"
    return sentences1
def sentence_splitter(lang):
    """
    :type lang: str
    :rtype: nltk.tokenize.punkt.PunktSentenceTokenizer
    """
    punkt_param = PunktParameters()
    path = os.path.dirname(__file__)
    ab_file = ''.join([path, SUBFOLDER, lang])
    if os.path.isfile(ab_file):
        punkt_param.abbrev_types = set(abbreviation_loader(ab_file))
    else:
        logging.info('Abbreviation file not found for language: %s', lang)
    splitter = PunktSentenceTokenizer(punkt_param)
    return splitter
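# A minimal sketch of what abbreviation_loader() might look like, assuming the
# abbreviation file lists one lowercase abbreviation (without the trailing
# period) per line; the real helper used above may differ.
def abbreviation_loader(path):
    with open(path, mode='r', encoding='utf-8') as f:
        return [line.strip().rstrip('.').lower() for line in f if line.strip()]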
def split_paragraph_into_sentence(text):
    punkt_param = PunktParameters()
    abbreviation = [
        'i.e', 'mr', 'st', 'mrs', 'dr', 'ms', 'fig', 'u.s.a', 'a.d', 'a.m',
        'cap', 'cf', 'cp', 'c.v', 'al', 'etc', 'e.g', 'ff', 'id', 'i.a',
        'i.e', 'lb', 'll.b', 'm.a', 'n.b', 'op.cit', 'p.a', 'ph.d', 'p.m',
        'p.p', 'prn', 'pro tem', 'p.s', 'q.d', 'q.e.d', 'q.v', 're', 'reg',
        'r.i.p', 's.o.s', 'stat', 'vis', 'vs', 'et al', 'et.al', 'etc',
        'e.g', 'i.e', 'eq', 'a.e', 'a.e', 'cf', 'con', 'const', 'fig',
        's.t', 'st', '(', ')', '?('
    ]
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)
    sentences = tokenizer.tokenize(text.lower())
    return replace_specieal_characters(sentences)
def bill_sent_chunk_tokenize(bill_text, min_sentence_length=20):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'prof', 'inc',
        '1', '2', '3', '4', '5', '6', '7', '8', '9'
    ])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    bill_sentences = sentence_splitter.tokenize(bill_text)
    bill_sentences = [s for s in bill_sentences if len(s) > min_sentence_length]
    return bill_sentences
def getSentences(paragraph):
    unicode_data = paragraph.decode("utf-8")
    data = "".join([i if ord(i) < 128 else "" for i in unicode_data])
    ## tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    punkt_params = PunktParameters()
    punkt_params.abbrev_types = set(['al', "inc", "mr", "dr", "mrs", "prof", "etal"])
    splitter = PunktSentenceTokenizer(punkt_params)
    sentences = splitter.tokenize(data)
    sentences1 = filter_list(sentences)
    ## print sentences1, "\n-----"
    return sentences1
def __init__(self):
    self.result = []
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['г', 'гор', 'ул', 'кв', 'д', 'корп', 'эт', 'стр',
                                    'пер', 'просп', 'тел', 'зам', 'каб', 'гос', 'мед'])
    self.sentence_splitter = PunktSentenceTokenizer(punkt_param)
    self.command = "./mystem -dig --eng-gr --format json < input.txt > mystem.json"
    self.verbs = []  # [u"отметил", u"сказал", u"подчеркнул", u"сообщил"]
    self.auxV_Author_reverse = []  # [u"[Пп]о .... словам", u"[Пп]о данным", u"[Пп]о сообщению"]
    self.dividers = {}
    self.load_verbs()
    self.dividersF()  # build the separators between direct speech and the author
    self.dividersREG = []
    self.authID = 1
    self.mystem_authors = u""
    self.authors = {}
    self.start = time.clock()
def plagiarismChecker():
    text = request.form['text_to_check']
    if text.lstrip().rstrip() == '':
        return render_template('input.html')
    punkt_parameters = PunktParameters()
    sentence_splitter = PunktSentenceTokenizer(punkt_parameters)
    sentences = sentence_splitter.tokenize(text)
    probability_of_plagiarism = 0
    for a_sentence in sentences:
        time.sleep(0.1)
        content = list(filter(lambda x: x in string.printable, a_sentence))
        str1 = ''.join(content)
        print(str1)
        # temp = list(content)
        # print(str(temp))
        the_term = urllib.parse.quote('+' + '"' + str1 + '"')
        page = requests.get('https://www.bing.com/search?q=' + the_term)
        print(page.url)
        if ((not "There are no results for" in page.text) and
                (not "No hay resultados para" in page.text) and
                (not "are no results for" in page.text)):
            probability_of_plagiarism += 1
    percent_plagiarised = str((probability_of_plagiarism / len(sentences)) * 100) + '%'
    return render_template('results.html', text=text, percent_plagiarised=percent_plagiarised)
def filtered_sentences(article, debug=False):
    # get sentences from the article
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(article.strip())
    # tokenize all of the sentences
    sentences = [(nltk.word_tokenize(sentence), sentence) for sentence in sentences]
    # throw out sentences with no linking verb
    sentences = filter(short_enough, sentences)
    sentences = filter(next(all_linking(sentence), None), sentences)
    # pos tag the remaining sentences
    sentences = [sentence_to_features(sentence) for sentence in sentences]
    # filter(good_enough, sentences)
    sorted(sentences, key=goodness)
    return [sentence[1] for sentence in sentences]
def sentence_tokenizer(text):
    """
    Tokenizes sentences.

    :param text:
    :return: list of sentences (a sentence is a string)
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = {
        'zzgl', 'prof', 'ca', 'vj', 't', 'mio', 'sro', 'lv', 'io', 'ihv',
        'bzw', 'usw', 'inkl', 'zt', 'vh', 'dr', 'entspr', 'dem', 'fort',
        'co', 'kg', 'zb', 'bspw', 'ua', 'rd', 'abs', 'etc', 'tsd', 'z.b',
        'evtl', '1', '2', '3', '4', '5', '6', '7', '8', '9', '19', '20', '21'
    }
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
def split_into_sentences(text):
    # splits the text into sentences and also preserves the corresponding
    # starting and ending indices
    startIndices = []
    endIndices = []
    corpus = []
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'doc', 'mr', 'mrs', 'prof', 'inc', 'mgr', 'ing', 'st'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    for start, end in sentence_splitter.span_tokenize(text):
        startIndices.append(start)
        endIndices.append(end)
        token = text[start:end]
        corpus.append(token)
    return startIndices, endIndices, corpus
def myNLTKParser(document, tagger):
    lexical_diversity = len(document) / len(set(document)) * 1.0
    punkt_param = PunktParameters()
    # if any customized abbrev
    # punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    # tokenize to sentences
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(document.replace('\'s', '_s'))
    # tokenize each sentence to words
    word_tokens = [[w.strip() for w in nltk.word_tokenize(s)
                    if not w.strip().lower() in stopwords] for s in sentences]
    # extend tokens with bigrams and trigrams
    extended_tokens = []
    for token_list in word_tokens:
        extended_tokens.append(token_list + nltk.bigrams(token_list) + nltk.trigrams(token_list))
    # word stemmer to normalize
    p_stemmer = PorterStemmer()
    stem_tokens = []
    for token_list in word_tokens:
        stem_tokens.append([p_stemmer.stem(w) for w in token_list])
    # POS tags
    tags = [tagger.tag(a) for a in extended_tokens]
    tags_of_verbs = ['NN', 'VB', 'VBP', 'VBG']
    tags_of_interest = ['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNPS', 'NNS', 'RB', 'RBR', 'RBS']
    tags_of_noun = ['NN']
    merged_tags_uni = [word for sublist in tags for (word, tag) in sublist
                       if tag in tags_of_verbs and isinstance(word, tuple) == False]
    merged_tags_bi = [word for sublist in tags for (word, tag) in sublist
                      if tag in tags_of_interest and isinstance(word, tuple) and len(word) == 2]
    merged_tags_tri = [word for sublist in tags for (word, tag) in sublist
                       if tag in tags_of_interest and isinstance(word, tuple) and len(word) == 3]
    uni_tags_fd = nltk.FreqDist(merged_tags_uni)
    bi_tags_fd = nltk.FreqDist(merged_tags_bi)
    tri_tags_fd = nltk.FreqDist(merged_tags_tri)
    return {
        'uni_fd': uni_tags_fd.max(),
        'bi_fd': bi_tags_fd.max(),
        'tri_fd': tri_tags_fd.max(),
    }
def get_important_sent(html_content):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'miss', 'prof', 'inc', 'no', 'cap', 'nos',
        'vol', 'para', 'exh'
    ])
    tokenizer = nltk.PunktSentenceTokenizer(punkt_param)
    soup = BeautifulSoup(html_content, 'html.parser')
    content = soup.get_text()
    paras = get_paras(content)
    sents = []
    for para in paras:
        para_content = content[para[0]:para[1] + 1]
        for sent in tokenizer.span_tokenize(para_content):
            sents.append(para_content[sent[0]:sent[1] + 1])
    sents = np.array(sents)
    BertTokenizer = bert.bert_tokenization.FullTokenizer(VOCAB_FILE, do_lower_case=True)
    input_ids, input_mask, segment_ids = convert_all_sentences(clean_data(sents), BertTokenizer)
    model = tf.keras.models.load_model("bert_model")
    input_X = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids
    }
    sents = sents[(model.predict(input_X, batch_size=1) > 0.4).reshape(-1, )]
    for sent in sents:
        segs = filter(lambda seg: seg != "", sent.split("\n"))
        for seg in segs:
            seg = seg.replace("\xa0", " ")
            while seg:
                cur = len(seg)
                while True:
                    if not cur:
                        return html_content
                    cur_str = seg[:cur]
                    res = html_content.find(cur_str)
                    if res == -1:
                        cur -= 1
                    else:
                        html_content = add_important_class(html_content, res, res + len(cur_str))
                        seg = seg[cur:]
                        break
    return html_content
def tokenize_sentences(self, text, word_threshold=5):
    """
    Returns a list of sentences given an input string of text.

    :param text: input string
    :param word_threshold: number of significant words that a sentence must
        contain to be counted (to count all sentences set equal to 1; 5 by default)
    :return: list of sentences
    """
    punkt_params = PunktParameters()
    # Not using a set literal to allow compatibility with Python 2.6
    punkt_params.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'ms', 'prof', 'mt', 'inc', 'i.e', 'e.g'
    ])
    sentence_splitter = PunktSentenceTokenizer(punkt_params)

    # 1. TOKENIZE "UNPROCESSED" SENTENCES FOR DISPLAY
    # Need to adjust quotations for correct sentence splitting
    text_unprocessed = text.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    # Treat line breaks as end of sentence (needed in cases where titles don't have a full stop)
    text_unprocessed = text_unprocessed.replace('\n', ' . ')
    # Perform sentence splitting
    unprocessed_sentences = sentence_splitter.tokenize(text_unprocessed)
    # Now that sentences have been split we can return them back to their normal formatting
    for ndx, sentence in enumerate(unprocessed_sentences):
        sentence = unicode_to_ascii(sentence)  # Sentence splitter returns unicode strings
        sentence = sentence.replace('? " ', '?" ').replace('! " ', '!" ').replace('. " ', '." ')
        sentence = self._remove_whitespace(sentence)  # Remove excess whitespace
        sentence = sentence[:-2] if (sentence.endswith(' .') or sentence.endswith(' . ')) else sentence
        unprocessed_sentences[ndx] = sentence

    # 2. PROCESS THE SENTENCES TO PERFORM STEMMING, STOPWORD REMOVAL ETC. FOR MATRIX COMPUTATION
    processed_sentences = [self.sanitize_text(sen) for sen in unprocessed_sentences]
    # Sentences should contain at least 'word_threshold' significant terms
    filter_sentences = [i for i in range(len(processed_sentences))
                        if len(processed_sentences[i].replace('.', '').split(' ')) > word_threshold]
    processed_sentences = [processed_sentences[i] for i in filter_sentences]
    unprocessed_sentences = [unprocessed_sentences[i] for i in filter_sentences]

    return processed_sentences, unprocessed_sentences
def sent_tokenize(text, abbrev_list=['dr', 'vs', 'etc', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig']):
    '''
    Tokenizes a string into sentences.

    Args:
        text (str) -- the text being tokenized
        abbrev_list (list) -- a list of abbreviations followed by a dot to
            exclude from tokenization, e.g. mr. ms. etc.

    Returns:
        list of strings -- list of sentences
    '''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(abbrev_list)
    sent_detector = PunktSentenceTokenizer(punkt_param)
    return sent_detector.tokenize(text)
def sent_tokenize(data, filter_threshold=None):
    '''
    Tokenizes a string into sentences and corresponding offsets.

    Args:
        data (str): the document itself
        filter_threshold (int): if a sentence is shorter than this, it will be ignored

    Returns:
        tuple(list(str), list(list)): tokenized sentences and corresponding offsets
    '''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = sent_detector.span_tokenize(data)
    return (sentences, offsets)
def ari(raw):
    # tokenize raw text and get words
    tokens = nltk.wordpunct_tokenize(raw)
    words = [word.lower() for word in tokens if word.isalpha()]
    # instantiate punctuation parameters
    punkt_params = PunktParameters()
    # specify abbreviations to be ignored in sentence separation
    punkt_params.abbrev_types = set(['dr', 'inc', 'mr', 'mrs', 'ms', 'prof', 'etc'])
    # separate into sentences using a PunktSentenceTokenizer
    sentences = PunktSentenceTokenizer(punkt_params).tokenize(raw)
    chars = 0
    for word in words:
        chars += len(word)
    return (4.71 * (chars / len(words)) + 0.5 * (len(words) / len(sentences)) - 21.43)
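# Hedged usage sketch for ari() above; the sample passage is made up for
# illustration only. Higher ARI scores indicate harder-to-read text.
sample = ("Dr. Smith examined the results. They were promising. "
          "Further trials are planned for next year.")
print(ari(sample))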
def summarize(body, pmid):
    if not body:
        return 'No summary available for PMID %d' % pmid
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['et al', 'i.e', 'e.g', 'ref', 'c.f', 'fig', 'Fig',
                                    'Eq', 'eq', 'eqn', 'Eqn'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(body)
    tagged = []
    for sentence in sentences:
        tagged.append(bigram_tagger.tag(sentence.split()))
    summary = []
    for sentence in tagged:
        for (word, tag) in sentence:
            if tag == 'PPSS' and word.lower() == 'we':
                summary.append(' '.join(nltk.tag.untag(sentence)))
    return summary
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 16 09:41:46 2016

@author: U505118
"""

import pandas as pd
import re
import unidecode
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.punkt import PunktParameters

punkt = PunktParameters()
punkt.abbrev_types = ['u.s.a', 'ltd', 'inc', 'no']
sen = PunktSentenceTokenizer(punkt)

for k in range(21, 87, 3):
    df = pd.read_csv('C:/Users/U505118/Desktop/P/10_K/outt' + str(k) + '01.csv')
    textl = []
    i = 0
    index = []
    for x in df['fact']:
        if type(x) is str:
            if len(x) > 100:
                # x = unidecode.unidecode(x)
                x = re.sub(r'<.*?>', ' ', x)
Author: Michael J Bommarito II <*****@*****.**>
Date: 2014-05-24
"""

# NLTK imports
from nltk.corpus import stopwords
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktWordTokenizer, \
    PunktParameters

# Import stopword list
english_stopwords = stopwords.words('english')

# Customized sentence tokenizer
punkt_param = PunktParameters()
punkt_param.abbrev_types = [x.lower().strip() for x in
                            set(['id', 'al', 'mr', 'mrs', 'prof', 'inc', 'llc', 'co', 'llp',
                                 'pp', 'f', 'app', '2d', '3d', 'ch', 's', 'us', 'cert', 'rev',
                                 'i', 'ii', 'iii', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                                 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
                                 'v', 'w', 'x', 'y', 'z', ')', 'no', 'cir', 'ca', 'c.a', 'fed',
                                 'sec', 'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep',
                                 'oct', 'nov', 'dec', 'ala', 'vt', 'st', 'u.s',
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer

# Would like to have this search through a CLTK_DATA environment variable
# Better to use something like make_cltk_path in cltk.utils.file_operations?
home = os.path.expanduser('~')
cltk_path = os.path.join(home, 'cltk_data')
if not os.path.isdir(cltk_path):
    os.makedirs(cltk_path)

word_tokenizer = WordTokenizer('latin')

if os.path.exists(cltk_path + 'latin/model/latin_models_cltk/tokenizers/sentence'):
    sent_tokenizer = TokenizeSentence('latin')
else:
    punkt_param = PunktParameters()
    abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'",
                     'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul',
                     'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop']
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

# Latin Library
try:
    latinlibrary = PlaintextCorpusReader(cltk_path + '/latin/text/latin_text_latin_library',
                                         '.*\.txt',
                                         word_tokenizer=word_tokenizer,
                                         sent_tokenizer=sent_tokenizer,
                                         encoding='utf-8')
    pass
except IOError as e:
    pass
    # print("Corpus not found. Please check that the Latin Library is installed in CLTK_DATA.")
def tokenize_latin_words(string):
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word
        return replace

    replacements = [(r'mecum', 'cum me'), (r'tecum', 'cum te'), (r'secum', 'cum se'),
                    (r'nobiscum', 'cum nobis'), (r'vobiscum', 'cum vobis'),
                    (r'quocum', 'cum quo'), (r'quacum', 'cum qua'), (r'quicum', 'cum qui'),
                    (r'quibuscum', 'cum quibus'), (r'sodes', 'si audes'),
                    (r'satin', 'satis ne'), (r'scin', 'scis ne'), (r'sultis', 'si vultis'),
                    (r'similist', 'similis est'), (r'qualist', 'qualis est')]
    for replacement in replacements:
        string = re.sub(replacement[0], matchcase(replacement[1]), string, flags=re.IGNORECASE)

    punkt_param = PunktParameters()
    abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'",
                     'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul',
                     'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop']
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)
    word_tokenizer = PunktLanguageVars()
    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []
    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        if temp_tokens[0].endswith('ne'):
            if temp_tokens[0].lower() not in exceptions:
                temp = [temp_tokens[0][:-2], '-ne']
                temp_tokens = temp + temp_tokens[1:]
        if temp_tokens[-1].endswith('.'):
            final_word = temp_tokens[-1][:-1]
            del temp_tokens[-1]
            temp_tokens += [final_word, '.']
        for token in temp_tokens:
            tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []
    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
def tokenize_latin_words(string):
    """
    Tokenizer that divides the string into a list of substrings.

    >>> from cltk.corpus.utils.formatter import remove_non_ascii
    >>> text = 'Dices ἐστιν ἐμός pulchrum esse inimicos ulcisci.'
    >>> remove_non_ascii(text)
    'Dices  pulchrum esse inimicos ulcisci.'

    :param string: This accepts the string value that needs to be tokenized
    :returns: A list of substrings extracted from the string
    """
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word
        return replace

    replacements = [(r'mecum', 'cum me'), (r'tecum', 'cum te'), (r'secum', 'cum se'),
                    (r'nobiscum', 'cum nobis'), (r'vobiscum', 'cum vobis'),
                    (r'quocum', 'cum quo'), (r'quacum', 'cum qua'), (r'quicum', 'cum qui'),
                    (r'quibuscum', 'cum quibus'), (r'sodes', 'si audes'),
                    (r'satin', 'satis ne'), (r'scin', 'scis ne'), (r'sultis', 'si vultis'),
                    (r'similist', 'similis est'), (r'qualist', 'qualis est')]
    for replacement in replacements:
        string = re.sub(replacement[0], matchcase(replacement[1]), string, flags=re.IGNORECASE)

    punkt_param = PunktParameters()
    abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'",
                     'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul',
                     'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop']
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)
    word_tokenizer = PunktLanguageVars()
    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []
    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        # Need to check that tokens exist before handling them;
        # needed to make stream.readlines work in PlaintextCorpusReader
        if temp_tokens:
            if temp_tokens[0].endswith('ne'):
                if temp_tokens[0].lower() not in exceptions:
                    temp = [temp_tokens[0][:-2], '-ne']
                    temp_tokens = temp + temp_tokens[1:]
            if temp_tokens[-1].endswith('.'):
                final_word = temp_tokens[-1][:-1]
                del temp_tokens[-1]
                temp_tokens += [final_word, '.']
            for token in temp_tokens:
                tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []
    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
def sent_tokenize(data):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = sent_detector.span_tokenize(data)
    new_sentences = deepcopy(sentences)
    new_offsets = deepcopy(offsets)
    for i, off in enumerate(offsets):
        if len(tokenizer.tokenize(sentences[i])) < 7:
            # Skip short sentences
            pass
        else:
            if i < len(offsets) - 1:
                if (offsets[i + 1][0] - offsets[i][1]) < 5:
                    new_sentences.append(sentences[i] + ' ' + sentences[i + 1])
                    new_offsets.append((offsets[i][0], offsets[i + 1][1]))
            if i < len(offsets) - 2:
                if ((offsets[i + 2][0] - offsets[i + 1][1]) < 5) and \
                        ((offsets[i + 1][0] - offsets[i][0]) < 5):
                    new_sentences.append(
                        sentences[i] + ' ' + sentences[i + 1] + ' ' + sentences[i + 2])
                    new_offsets.append((offsets[i][0], offsets[i + 2][1]))
            # (variants that merged windows of four or more adjacent sentences
            #  were left commented out in the original source)
    print new_offsets
    return {'sentences': new_sentences, 'offsets': new_offsets}
def sent_tokenize(data, filter_short=False, filter_verbless=False):
    """
    Tokenize sentences

    Tokenize `data` into two arrays: sentences and offsets
    Returns a tuple (`sentences`, `offsets`)
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = sent_detector.span_tokenize(data)
    new_sentences = []
    new_offsets = []
    to_del = []
    if filter_verbless:
        pos = pos_tagger.extract_nlp_batch()
        for i in range(len(sentences)):
            okay = False
            for word in pos['sentences'][i]['words']:
                if word[1]['PartOfSpeech'] in verbs:
                    okay = True
                    break
            if not okay:
                # the sentence doesn't have a verb
                to_del.append(i)  # mark for deletion
        sentences = multi_delete(sentences, to_del)
        offsets = multi_delete(offsets, to_del)
    if filter_short and not filter_verbless:
        for i in range(len(sentences)):
            if len(sentences[i]) >= filter_short:
                new_sentences.append(sentences[i])
                new_offsets.append(offsets[i])
    new_sentences = [s for s in sentences if sentences]
    # (an earlier variant that merged adjacent sentences, as in the function
    #  above, was left commented out here in the original source)
    print new_offsets
    return {'sentences': new_sentences, 'offsets': new_offsets}
def gather_input():
    # gather input
    for file in os.listdir("../scrapper/"):
        if file.endswith(".txt"):
            inputFile = file
    file = open("../scrapper/" + inputFile, "r")
    input = file.read()
    file.close()
    # os.remove("../scrapper/" + inputFile)

    # extract text
    reg_string = ur"\"text\":\"(.+?)[^\\]\""
    data_array = re.findall(reg_string, input)
    # extract location of tweet
    reg_string = ur"\"location\":\"(.*?)\""
    location_array = re.findall(reg_string, input)
    # extract whether retweeted or not
    reg_string = ur"\"retweeted\":(.+?),"
    retweet_bool = re.findall(reg_string, input)

    # today's date in YYYYMMDD format
    date = datetime.datetime.now()
    date = date.date()
    # date = date.strftime("%Y%m%d")

    ## calculate the barrier date
    date_diff = int(sys.argv[1])
    DD = datetime.timedelta(days=date_diff)
    barrier_date = (datetime.datetime.now() - DD).date()

    ## load the whitelist and create an array of arrays as - [noun, sentiment, count]
    file = open("../py_code/white_list.txt", "r")
    white_list = []
    line = file.readline()
    while line:
        white_list.append([line.rstrip(), 0, 0])
        line = file.readline()
    file.close()

    ## create a sentence tokenizer
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc',
                                    '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                    '11', '12', '13', '14', '15', '16', '17', '18', '19', '20'])
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    ## next step is to inject into the database
    db = MySQLdb.connect(host="localhost", user="******", passwd="{2qGq(22+5iU", db="Insights")
    cur = db.cursor()

    ## filter out those tweets which have prices in them - usually sales, or retweets
    i = 0
    for text in data_array:
        if retweet_bool[i] != "false":
            pass
        else:
            ## filter text as many users don't put a space after a full stop,
            ## which is essential for the sentence tokenizer
            data_array[i] = re.sub(r'([\.\?\!])(\w)', r'\1 \2', data_array[i])
            blob = TextBlob(data_array[i])
            blob_sentiment = int(blob.sentiment.polarity * 1000) / 1000.0
            sql = "INSERT INTO Phrases(Phrase,Sentiment,Location,Date) VALUES (\"" + data_array[i] + "\", " + str(blob_sentiment) + ", \"" + location_array[i] + "\", \"" + str(date) + "\")"
            cur.execute(sql)
            ## tokenize the tweets, for sentiment analysis
            sentences = sent_tokenizer.tokenize(data_array[i])
            if len(sentences) == 1:
                ## run through the whitelist; for each match add sentiment and count to the array
                for word in white_list:
                    if (sentences[0].lower()).find(word[0]) != -1:
                        word[1] = word[1] + blob_sentiment
                        word[2] = word[2] + 1
            else:
                for sentence in sentences:
                    ## run through the whitelist; for each match add sentiment and count to the array
                    for word in white_list:
                        if (sentence.lower()).find(word[0]) != -1:
                            blob = TextBlob(sentence)
                            word[1] = word[1] + int(blob.sentiment.polarity * 1000) / 1000.0
                            word[2] = word[2] + 1
        i = i + 1
    db.commit()

    ### now integrate these into the Sentiment db; if there is no entry for today, insert the phrase and create one
    sql = "SELECT * FROM Sentiment WHERE `Date` ='" + str(date) + "' LIMIT 1;"
    cur.execute(sql)
    if cur.rowcount == 0:
        for word in white_list:
            if word[2] != 0:
                sql = "INSERT INTO Sentiment VALUES ('" + str(date) + "','" + word[0] + "','" + str(word[1]) + "','" + str(word[2]) + "');"
                cur.execute(sql)
    ### else get the entry in the table, add sentiment and count, store back
    else:
        for word in white_list:
            if word[2] != 0:
                sql = "SELECT Sentiment,Count FROM Sentiment WHERE `Date` ='" + str(date) + "'AND `Phrase`='" + word[0] + "';"
                cur.execute(sql)
                for row in cur.fetchall():
                    new_sentiment = float(row[0]) + word[1]
                    new_count = row[1] + word[2]
                    sql = "UPDATE Sentiment SET `Sentiment`=" + str(new_sentiment) + ",`Count`=" + str(new_count) + " WHERE `Date` ='" + str(date) + "'AND `Phrase`='" + word[0] + "';"
                    cur.execute(sql)
    db.commit()

    ### now add all the sentiment and count for all phrases in the whitelist in the Sentiment db
    ### above the barrier_date; add to json those whose count is not zero
    total_sentiment = 0
    total_count = 0
    json_array = []
    for word in white_list:
        sql = "SELECT Sentiment,Count FROM Sentiment WHERE `Date` >'" + str(barrier_date) + "'AND `Phrase`='" + word[0] + "';"
        cur.execute(sql)
        if cur.rowcount != 0:
            for row in cur.fetchall():
                total_sentiment = total_sentiment + float(row[0])
                total_count = total_count + int(row[1])
            json_array.append({"noun": word[0],
                               "sentiment": int(total_sentiment / total_count * 1000) / 1000.0,
                               "count": total_count})
        total_sentiment = 0
        total_count = 0
    db.close()
    print(json.dumps(json_array))
#!/usr/bin/env python
"""TAGGING INSTRUCTIONS

If you don't know what to tag it with, reject with 'n' or just give the
generic 'yn' tag! But seriously, 'yn' is a generic tag, since pretty much
everything can be a polar question. If you have any other tags, please
don't put the 'yn' tag. Just use it if you don't want to reject a sentence
but don't have an alternative tag.

You can use multiple tags for one sentence! Just separate them with spaces
on the same line.

If you see a shitty "sentence", just 'n' it. punkt the tokenizer sucks."""
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
import sys

punkt_param = PunktParameters()
punkt_param.abbrev_types = set(["dr", "vs", "mr", "mrs", "prof", "inc", "v"])
sentence_splitter = PunktSentenceTokenizer(punkt_param)

valid_tags = frozenset(["who", "what", "when", "where", "how", "why", "yn"])


def output_tags(f, sentence, tags):
    for tag in tags:
        f.write(sentence + "\n**" + tag.upper() + "**\n\n")


# cool shit stolen from http://stackoverflow.com/questions/14374181/moving-back-an-iteration-in-a-for-loop
def repeatable(it):
    buf, it = None, iter(it)
    while True:
        if buf is None:
            # the buffer is empty, send them the next elem
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

punkt_param = PunktParameters()
punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
sentence_splitter = PunktSentenceTokenizer(punkt_param)
text = "is THAT what you mean, Mrs. Hussey?"
sentences = sentence_splitter.tokenize(text)
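# With 'mrs' registered as an abbreviation, the text above should come back as
# a single sentence rather than being split after "Mrs.".
print(sentences)  # expected: ['is THAT what you mean, Mrs. Hussey?']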
import nltk
from nltk.tree import Tree
import os.path
from PreProcessing import parsers

# edit this when changing dirs
LangPaths = os.path.realpath("C:/users/rihanna/Documents/Pol/ThesisIt/SumMe/Summarizer/langdetector/profiles/")
tltagger = nltk.data.load("taggers/filipino_aubt.pickle")  # Filipino POS tagger
tlChunker = nltk.data.load("chunkers/filipino_ub.pickle")  # Filipino chunker here
enChunker = nltk.data.load("chunkers/conll2000_ub.pickle")  # enChunker here
punkt_param = PunktParameters()  # creates an opening for tokenizer parameters
punkt_param.abbrev_types = set(['gng', 'mr', 'mrs', 'dr', 'rep'])  # abbreviations further accepted go here
sentence_splitter = PunktSentenceTokenizer(punkt_param)
tokenized = ""
gateway = JavaGateway()
detector = gateway.entry_point
detector.init(LangPaths)


def LangDetect(str):
    return detector.detect(str)


def tokenizer(str):
    # print(wordpunct_tokenize(str))
    return wordpunct_tokenize(str)
def getSplitter():
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = ABBREVS
    return PunktSentenceTokenizer(punkt_param)
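# Hypothetical usage of getSplitter(); ABBREVS is assumed to be a set of
# lowercase abbreviations defined elsewhere in this module.
splitter = getSplitter()
print(splitter.tokenize("Mr. Smith arrived late. The meeting had already started."))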