def __init__(self):
     with open(TREETAGGER_ABBREVIATIONLIST, mode='r', encoding='utf-8') as f:
         abbr = set([l.strip('.\n') for l in f.readlines()])
     
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = abbr 
     self.tokenizer = PunktSentenceTokenizer(punkt_param)
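A minimal standalone sketch of the same pattern. The file name and its one-abbreviation-per-line format are assumptions; the strip('.\n') call above suggests entries may carry a trailing period, which Punkt does not want.
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# "abbreviations.txt" is a hypothetical stand-in for TREETAGGER_ABBREVIATIONLIST:
# assumed to hold one abbreviation per line, optionally ending in a period.
with open('abbreviations.txt', mode='r', encoding='utf-8') as f:
    abbr = set(line.strip('.\n').lower() for line in f)

params = PunktParameters()
params.abbrev_types = abbr
tokenizer = PunktSentenceTokenizer(params)
print(tokenizer.tokenize('Dr. Smith arrived late. He apologized.'))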
Example #2

def parseTextToSentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
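A quick usage sketch for parseTextToSentences; the sample text is invented.
# Hypothetical usage of parseTextToSentences() defined above.
sample = 'Mr. Smith met Dr. Jones yesterday. "It went well!" he said.\nA second paragraph follows here.'
for sentence in parseTextToSentences(sample):
    print(sentence)
# With 'mr' and 'dr' registered as abbreviations, Punkt should not break after
# "Mr." or "Dr."; the replace() calls above help it break after closing quotes.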
 def summarize(self):
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
     sentence_splitter = PunktSentenceTokenizer(punkt_param)
     sentences = sentence_splitter.tokenize(self.text)
     structure = {}
     sentence_objects = []
     for idx in range(len(sentences)):
         obj = {'text' : sentences[idx], 'index' : idx , 'data': {}}
         sentence_objects.append(obj)
     structure['sentences'] = sentence_objects
     self.sentencecount = len(structure['sentences'])
     structure['ordered'] = []
     structure['weights'] = {'words' : FreqDist(nltk.word_tokenize(preprocess(self.text))), 'total': 0, 'transformed': 0}
     structure['weights']['total'] = sum(structure['weights']['words'].values())
     self.sentenceIndex = 0
     for each_sent in structure['sentences']:
         each_sent['data']['tokens'] = nltk.word_tokenize(preprocess(each_sent['text']))
         each_sent['data']['sinTransform'] = (1-math.sin(self.sentenceIndex*(math.pi/self.sentencecount)))+1
         for each_word in structure['weights']['words']:
             if each_word in each_sent['data']['tokens']:
                 structure['weights']['words'][each_word] *= each_sent['data']['sinTransform']
         self.sentenceIndex += 1
     structure['weights']['transformed'] = sum(structure['weights']['words'].values())
     self.sentenceIndex = 0
     for each_sent in structure['sentences']:
         each_sent['data']['weights'] = {'words': self.calculate_relative_frequence(each_sent['data']['tokens'], structure['weights']['words']), 'total': 0}
         each_sent['data']['weights']['total'] = sum(each_sent['data']['weights']['words'].values())
         self.sentenceIndex += 1
     structure['ordered'] = sorted(structure['sentences'], key=lambda x:x['data']['weights']['total'], reverse=True)
     structure_keep = structure['ordered'][:self.quota]
     structure_keep.sort(key=lambda x:x['index'])
     for eac_sen in structure_keep:
         self.summary.append(eac_sen['text'])
Example #4
 def tokenise(self, sample):
     # first pass - look for poems
     verses = self.scan_for_verse(sample)
     if verses:
         self.notes.append("got {} verses".format(len(verses)))
         verses = [ re.sub(r'\[\d+\]', '', v) for v in verses ]
     else:
         verses = []
     # second pass - look for sentences
     text = re.sub(r'\[\d+\]', '', sample)
     text = re.sub("\r\n", ' ', text)
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(self.cf['abbreviations'])
     tokenizer = PunktSentenceTokenizer(punkt_param)
     sentences = tokenizer.tokenize(text)
     sentences = sentences[1:-1]
     self.notes.append("got {} sentences".format(len(sentences)))
     # remove any sentences which we already found as part of verses
     for s in list(sentences):
         matches = [ v for v in verses if s[:SENTENCE_MATCH] in v ]
         if matches:
             self.notes.append("found sentence {} in verses {}".format(s, matches))
             sentences.remove(s)
     verses.extend(sentences)
     return verses
 def fractal_representation(self):
     punkt_param = PunktParameters()
     for each_paragraph in self.paragraphs:
         buffer_p = paragraph()
         buffer_p.paragraph = each_paragraph
         buffer_p.tokens = nltk.word_tokenize(preprocess(each_paragraph))
         buffer_p.weights['words'] = FreqDist(buffer_p.tokens)
         buffer_p.weights['total'] = {'words':0, 'sentences':0}    
         punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
         sentence_splitter = PunktSentenceTokenizer(punkt_param)
         sentences = sentence_splitter.tokenize(each_paragraph)
         for each_sentence in sentences:
             self.stotal += 1
             buffer_s = sentence()
             buffer_s.sentence = each_sentence
             buffer_s.tokens = nltk.word_tokenize(preprocess(each_sentence))
             if len(buffer_s.tokens) > 0:
                 buffer_s.weights['sentence'] = FreqDist(buffer_s.tokens)
                 buffer_s.weights['paragraph'] = self.calculate_relative_frequence(buffer_s.tokens, buffer_p.weights['words'])
                 buffer_s.weights['document'] = self.calculate_relative_frequence(buffer_s.tokens, self.fractal.weights)
                 buffer_s.weights['total'] = {}
                 buffer_s.weights['total']['sentence'] = 1
                 buffer_s.weights['total']['paragraph'] = sum(buffer_s.weights['paragraph'].values())
                 buffer_s.weights['total']['document'] = sum(buffer_s.weights['document'].values())
                 self.s_weight += buffer_s.weights['total']['document']
                 buffer_p.weights['total']['sentences'] += buffer_s.weights['total']['document']
                 buffer_p.sentences.append(buffer_s)
         buffer_p.weights['total']['words'] = sum(buffer_p.weights['words'].values())
         self.fractal.paragraphs.append(buffer_p)
         self.pindex += 1
Example #6
def getSentences(text):
	#returns a list of sentences tokenized by Punkt
	punkt_param = PunktParameters()
	punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
	sentence_splitter = PunktSentenceTokenizer(punkt_param)
	sentences = sentence_splitter.tokenize(text)
	return sentences
Example #7
def retrieveUrlText(url):
    try:
        config = Config()
        config.request_timeout = 1000
        config.memoize_articles = False
        config.fetch_images = False
        config.browser_user_agent = 'Mozilla/5.0'
        article = Article(url, config)
        article.download(recursion_counter=5)
        if article.download_state != 2:
            return ''
        article.parse()
        articleText = article.text.replace('\n', ' ')
    except KeyboardInterrupt:
        raise
    except Exception:
        return ''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'fig', 'figs',
        'chem', 'ph'
    ])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    articleSentences = validateSentences(
        sentence_splitter.tokenize(articleText))
    return articleSentences
 def _split_sentences(self, text):
     from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
     sentence_splitter = PunktSentenceTokenizer(punkt_param)
     sentences = sentence_splitter.tokenize(text)
     return sentences
Example #9
    def __getlemmas(self, txt):
        '''
            Filters noun, adjective and verb from input text, lemmatize them 
            and returns as list of words(tokens)
            
            Parameters:
                @txt  : The text file (str format) which must be lemmatized
        '''

        lemma = WordNetLemmatizer()
        punkts = PunktParameters()
        punkts.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
        sent_tokenizer = PunktSentenceTokenizer(punkts)
        sentences = sent_tokenizer.tokenize(txt)

        lemma_tokens = []
        for sentence in sentences:
            stoken = word_tokenize(sentence)
            pos_sent = pos_tag(stoken)

            for p in pos_sent:
                if p[1].startswith('N'):
                    pos = wordnet.NOUN
                elif p[1].startswith('J'):
                    pos = wordnet.ADJ
                elif p[1].startswith('V'):
                    pos = wordnet.VERB
                else:
                    pos = None

                if pos:
                    lemma_tokens.append(lemma.lemmatize(p[0].lower(), pos))

        return lemma_tokens
def read_docx(path):
    """read .docx (Microsoft 2007+)
    """
    try:
        doc = docx.Document(path)

        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set(['fig'])
        tokenizer = PunktSentenceTokenizer(punkt_param)

        body = []
        for p in doc.paragraphs:
            body += tokenizer.tokenize(clean_text(p.text))
        body = '\n'.join(body)

        tables = []
        for t in doc.tables:
            table = {'cells': []}
            for row in t.rows:
                row_elements = []
                for cell in row.cells:
                    for p in cell.paragraphs:
                        row_elements.append({'text': clean_text(p.text)})
                table['cells'].append(row_elements)
            tables.append(table)

        data = PaperData(body, tables)
    except Exception:
        logger.info('fail: %s', path)
        traceback.print_exc()
        return PaperData()

    return data
Example #11
    def process_doc(self, xmlfile):

        # Set up sentence tokenizer
        punkt_param = PunktParameters()
        # Domain specific abbreviations
        punkt_param.abbrev_types = set(["e.g", "al", "i.e"])
        sent_tokenize = PunktSentenceTokenizer(punkt_param).tokenize

        tree = etree.parse(xmlfile)
        algrthms = tree.getroot()
        block = algrthms.iterdescendants(["sectionHeader", "bodyText"])
        section = ""
        counter = 0
        sentences = []
        try:
            while True:
                blk = next(block)
                if blk.tag == "sectionHeader":
                    section = blk.get("genericHeader")
                    # check if the next blk is bodytext. it might be a section
                    sentences = sent_tokenize(remove_crlf(next(block).text))
                    self.update_section(section, OrderedDict(enumerate(sentences, start=counter)))
                else:
                    sentences = sent_tokenize(remove_crlf(blk.text))
                    self.update_section(section, OrderedDict(enumerate(sentences, start=counter)))
                counter += len(sentences)
        except StopIteration:
            pass
        except Exception as e:
            logit("Something went wrong while processing the document!")
            logit(section)
            logit(str(e))
Example #12
def clean(text):
    # Returns cleaned, tokenized documents from raw HTML text.

    text = cleanmyhtml(text)

    # We need to remove things like (R-NE). There are some wacky abbreviations
    # for states, but all fall under five.
    text = re.sub(r'\w{1}\-\w{1,5}\.', '', text)

    # U.S. needs to become US or else it'll tokenize weirdly. Same with
    # H.R. (house resolution).
    text = re.sub(r'U\.S\.', 'US', text)
    text = re.sub(r'H\.R\.', 'HR', text)

    # NLTK is pretty poor at tokenizing sentences that contain ." or .'
    # We'll insert a space into these.

    text = re.sub(r'\.\"', '. \"', text)
    text = re.sub(r"\.'", ". '", text)

    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'reps', 'Reps', 'H.R', 'h.r', 'hr', 'HR', 'vs', 'mr', 'ms',
        'pres', 'mrs', 'prof', 'inc', 'sens', 'Sens', 'Sen', 'sen'
    ])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text)
    return (sentences)
Example #13
def summarize(text, ref='', lines=7):
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    clean_text = text.lower()
    clean_text = re.sub(r'\W', ' ', clean_text)
    clean_text = re.sub(r'\d', ' ', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    text = text.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = sentence_splitter.tokenize(text)
    #sentences = nltk.sent_tokenize(text)
    stop_words = nltk.corpus.stopwords.words('english')

    word_count = {}
    for word in nltk.word_tokenize(clean_text):
        if word not in stop_words:
            word_count[word] = word_count.get(word, 0) + 1

    sentence_score = {}
    i = 0
    for s in sentences:
        for word in nltk.word_tokenize(s.lower()):
            if word in word_count.keys():
                old = sentence_score.get(s, (0, 0, i))
                i += 1
                sentence_score[s] = (old[0] + word_count[word], old[1] + 1,
                                     old[2])

    def score(pair):
        return (pair[0] - pair[2]) / pair[1]

    scores = {}
    for s in sentence_score.keys():
        if sentence_score[s][1] > 2:
            scores[s] = score(sentence_score[s])
        else:
            scores[s] = score(sentence_score[s]) - 100

    best_sentences = heapq.nlargest(lines, scores, key=scores.get)
    best_sentences.sort(key=lambda x: sentence_score[x][2])

    string = ''

    for s in best_sentences:
        if s[0] == ' ':
            s = s[1:]
        if 'refer' in s and len(scores.keys()) < 4:
            print('Please be more specific\n')
            if len(ref) > 1:
                print('Here are some suggestions:')
            for i in range(len(ref)):
                print("=>", ref[i])
            print('\n')
            return
        print(s)
        string += s + '\n'
    return string
Example #14
def _punkt_sent_tokenize(text):
    '''
     Sentence segmentation using nltk PunktSentenceTokenizer.
    '''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(config.tokenize_abbrev)
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
 def _split_text_to_sentences(self, text):
     # splits text to sentences (uses some utilities from nltk)
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(
         ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
     sentence_splitter = PunktSentenceTokenizer(punkt_param)
     sentences = sentence_splitter.tokenize(text)
     return sentences
Example #16
 def sentenceToken(self, text):
     """Split review context into a list of sentences.
     Text: a sentence.
     """
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs'])
     tokenizer = PunktSentenceTokenizer(punkt_param)
     return tokenizer.tokenize(text)
Example #17
def parse (text):
    """Use nltk's PunktSentenceTokenizer to convert the text string into
    a list of English-language sentences."""

    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(ABBREVIATIONS)
    sentence_splitter = PunktSentenceTokenizer(punkt_param)

    return sentence_splitter.tokenize(preprocess(text))
Example #18
def tokenize_to_sentences2(doc):
    punkt_param = PunktParameters()
    abbreviations = [
            "u.s.a", "fig", "gov", "sen", "jus", "jdg", "rep", "pres",
            "mr", "mrs", "ms", "h.r", "s.", "h.b", "s.b", "u.k", "u.n",
            "u.s.s.r",
    ]
    punkt_param.abbrev_types = set(abbreviations)
    tokenizer = PST(punkt_param)
    return tokenizer.tokenize(doc)
    def __init__(self, abbrev=['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e']):
        """Initialize Textualizer.

        Usually, you need to create only one textualizer in your script.

        Args:
            abbrev (list): List of abbreviations
        """
        punkt = PunktParameters()
        punkt.abbrev_types = set(abbrev)
        self.tokenizer = PunktSentenceTokenizer(punkt)
    def _nltk_tokenizer(self, document):
        abbreviation = ['sra', 'dª', 'dña', 'sras', 'sres', 'sr', 'excmos', 'excmo', 'excma', 'excmas', 'ilma', 'ilmas',
                        'ilmo', 'ilmos', 'ilma', 'ilmas', 'art', 'arts', 'núm', 'cp', 'c.p', 's.l', 'rcud', 'rcuds', 'rec']

        punkt_param = PunktParameters()

        punkt_param.abbrev_types = set(abbreviation)
        sentence_splitter = PunktSentenceTokenizer(punkt_param)
        text = document
        sentences = sentence_splitter.tokenize(text)

        return sentences
Example #21
    def rank(self,
             doc="doc.txt",
             out="doc_textrank.txt",
             top=10,
             stop_word=False,
             stem=False):
        with open(doc, "r") as lofile:
            document = lofile.read()

        # == refine document for process ==
        document = document.replace('\n', ' ')\
            .replace('."', '. "').replace('?"', '? "').replace('!"', '! "')\
            .replace('.”', '. ”').replace('?”', '? ”').replace('!”', '! ”')
        # document = ' '.join(document.strip().split('\n'))

        # == sentences tokenize ==
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set(
            ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
        sentence_tokenizer = PunktSentenceTokenizer(punkt_param)
        sentences = sentence_tokenizer.tokenize(document)

        # FEATURE: stem words, remove stop words

        # == count words for each sentence ==
        wordCounter = CountVectorizer()  # approach 0: non
        if stop_word:
            wordCounter = CountVectorizer(
                stop_words='english')  # approach 1: only stop_word
            if stem:
                wordCounter = CountVectorizer(
                    stop_words='english',
                    preprocessor=stemSen)  # approach 2: stop_word & stem
        count_matrix = wordCounter.fit_transform(sentences)
        normalized_matrix = TfidfTransformer().fit_transform(count_matrix)

        # wordCounter = TfidfVectorizer()
        # normalized_matrix = wordCounter.fit_transform(sentences)

        # == similarity among sentences ==
        similarity_graph = normalized_matrix * normalized_matrix.T
        nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
        scores = nx.pagerank(nx_graph)
        orderedSentences = sorted(
            ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

        if len(orderedSentences) < top:
            top = len(orderedSentences)
        with open(out, "w") as lofile:
            for i in range(0, top):
                lofile.write(orderedSentences[i][1].encode('ascii', 'ignore').decode('ascii'))
                lofile.write("\n")
Example #22
def nltk_get_tokenizer():
    """
    Return a tokenizer with some customization for Hansard
    :return:  a Punkt tokenizer
    """
    # With thanks to
    # https://stackoverflow.com/questions/34805790/how-to-avoid-nltks-sentence-tokenizer-spliting-on-abbreviations
    punkt_param = PunktParameters()
    # 'hon. Gentleman' is very common in Hansard!
    abbreviation = ['hon', 'mr', 'mrs', 'no']
    punkt_param.abbrev_types = set(abbreviation)
    return PunktSentenceTokenizer(punkt_param)
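A small usage sketch (the quote is invented) showing why 'hon' sits in the abbreviation set.
# Hypothetical usage of nltk_get_tokenizer() defined above.
tokenizer = nltk_get_tokenizer()
print(tokenizer.tokenize('The hon. Gentleman raised a fair point. Mr. Speaker agreed.'))
# Without 'hon' and 'mr' as abbreviations, Punkt would tend to split after "hon." and "Mr."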
    def segmentPureText(self, txtfile):
        punkt_param = PunktParameters()
        abbreviation = [
            "U.S.A", "u.s.a", "figure", "fig", "Table", "table", "Eq", "eq",
            "equation", "et al", "e.g", "i.e", "Fig", "s.d", "etc", "i.v"
        ]
        punkt_param.abbrev_types = set(abbreviation)
        tokenizer = PunktSentenceTokenizer(punkt_param)
        tokenized_output = tokenizer.tokenize(txtfile)
        # print(tokenized_output)

        return tokenized_output
def parse_text_to_sentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
def tokenize_sentences(text):
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters, PunktLanguageVars

    class CommaPoint(PunktLanguageVars):
        sent_end_chars = ('.', '?', '!')

    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'al', 'i.v'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param,
                                               lang_vars=CommaPoint())
    sentences = sentence_splitter.tokenize(text)

    return sentences
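Note that ('.', '?', '!') is already the default sent_end_chars in PunktLanguageVars, so CommaPoint as written behaves like the stock tokenizer. A hedged sketch of an actual customization (SemicolonPoint and the sample text are hypothetical):
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters, PunktLanguageVars

class SemicolonPoint(PunktLanguageVars):
    # Hypothetical variant: also treat ';' as a sentence boundary.
    sent_end_chars = ('.', '?', '!', ';')

splitter = PunktSentenceTokenizer(PunktParameters(), lang_vars=SemicolonPoint())
print(splitter.tokenize('First clause; second clause. Another sentence.'))
# Should yield three pieces instead of two, since ';' now ends a sentence.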
Example #26
def keyword_sentiment():

    ## take in tht input
    word = sys.argv[1]
    date_diff = int(sys.argv[2])
    
    ## create a sentence_tokenizer
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20'])
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)
    
    ## caluclate the barrier date
    DD = datetime.timedelta(days=date_diff)
    barrier_date = datetime.datetime.now()- DD

    ## make connection to db and fetch tweets (and respective sentiment) above the barrier_date
    db = MySQLdb.connect(host="localhost",user="******",passwd="{2qGq(22+5iU",db="Insights")
    cur = db.cursor()
    sql = "SELECT Phrase,Sentiment FROM Phrases WHERE `Date`>'"+str(barrier_date)+"';"
    cur.execute(sql)

    total_sentiment = 0
    total_count = 0
    ## locate tweets which contain keyword, tokenize them into sentences
    for row in cur.fetchall():
        if(row[0].lower().find(word.lower())!=-1):
            sentences = sent_tokenizer.tokenize(row[0])
            
    ## if a single sentence then just take the sentiment from db
            if len(sentences) == 1:
                total_sentiment = total_sentiment + float(row[1])
                total_count = total_count+1
                
    ## else add together sentiment of sentence and keep the count
            else:
                for sentence in sentences:
                        blob = TextBlob(sentence)
                        total_sentiment= total_sentiment + int(blob.sentiment.polarity*1000)/1000.0
                        if(sentence.lower().find(word.lower())!=-1):
                            total_count = total_count+1
                            
    ## json the total_sentiment/count and count
    if(total_count!=0):
        json_array = [{"sentiment": int(total_sentiment/total_count*1000)/1000.0, "count": total_count}]
    else:
        json_array = [{"sentiment": 0, "count": 0}]
    ## close the connection to the db
    db.close()
    ## print the json
    print(json.dumps(json_array))
Example #27
def split_sentences(text):
    """Divides text into sentences. Return list of sentences.

    :param text:
    :return: list of sentences
    """
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    # TODO: Turkish abbreviations could be added here, e.g. the 'dr.'-style abbreviations mentioned by Cuneyd hoca
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text)

    return sentences
Example #28
def splitIntoSentences2(file_name):
  punkt_param = PunktParameters()
  punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
  sentence_splitter = PunktSentenceTokenizer(punkt_param)
  fp = open(file_name)
  data = fp.read()
  data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

  sentences = []
  for para in data.split('\n'):
    if para:
      sentences.extend(sentence_splitter.tokenize(para))
  # print '\n-----\n'.join(sentences)
  return sentences
Example #29
    def loadCorpus(self, path):
        
        for encoding in self.__encodings:

            try:
                self.__path = path
                fileName = codecs.open( self.__path,'r', encoding=encoding )
                self.__rawText = fileName.read()
                break
            
            except UnicodeDecodeError:
                encoding = ''
                continue
                 
        if encoding!='':
            self.initFields()
            
            #SENTENCES
            # more abbreviations with dots
            punkt_param = PunktParameters()
            punkt_param.abbrev_types = set(['dr', 'vs', 'n', 'v', 'etc', 'art', 'p', 'Cost', 'ss', 'pag'])
            sentence_splitter = PunktSentenceTokenizer(punkt_param)
            text = re.sub(ur'[\'\<\>`’]', ' ', self.__rawText)
            #text = re.sub('(\d+)', r' \1 ', text)
            sentences = sentence_splitter.tokenize(text)
            
            #TOKENS
            self.__tokens = [[token, ''] for token in list(itertools.chain(*[ customWordtokenize(sent) for sent in sentences]))]
            wordTokenizer = RegexpTokenizer('[a-zA-Z0-9\xe0\xe1\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa]+')
            #wordTokenizer = RegexpTokenizer('[\w]+')
            
            sentences = [wordTokenizer.tokenize(sent.lower()) for sent in sentences if len(wordTokenizer.tokenize(sent)) > 0]
            words =  list(itertools.chain(*sentences))
            self.__words = words
            self.__sentences = sentences
            
            self.__avgSentLength = round(np.mean( [len(sent) for sent in sentences]), 3)
            self.__avgWordLength = round(np.mean( [len(word) for word in words]), 3)
            self.__freqDist = FreqDist(words)
            self.__wordCount = len(words)
            self.__lexicalDiversity = round(len(self.__freqDist.items())/float(len(words)), 5)
            
            ### resetting members
            self.__concordanceIndex = None
            self.__bigrams = None
                 
        return encoding
def getSentences(paragraph):

	unicode_data= paragraph.decode("utf-8")
	data= "".join([i if ord(i) < 128 else "" for i in unicode_data])

	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	punkt_params = PunktParameters()
	punkt_params.abbrev_types = set(['al',"inc","mr","dr","mrs","prof"])
	splitter = PunktSentenceTokenizer(punkt_params)

	sentences=splitter.tokenize(data)
	
	sentences1=filter_list(sentences)
	##print sentences1,"\n----------------------------------------------------------------------------"
	return sentences1
Example #31
def sentence_splitter(lang):
    """

    :type lang: str
    :rtype: nltk.tokenize.punkt.PunktSentenceTokenizer
    """
    punkt_param = PunktParameters()
    path = os.path.dirname(__file__)
    ab_file = ''.join([path, SUBFOLDER, lang])
    if os.path.isfile(ab_file):
        punkt_param.abbrev_types = set(abbreviation_loader(ab_file))
    else:
        logging.info('Abbreviation file not found for language: %s', lang)
    splitter = PunktSentenceTokenizer(punkt_param)
    return splitter
Example #32
def split_paragraph_into_sentence(text):
    punkt_param = PunktParameters()
    abbreviation = [
        'i.e', 'mr', 'st', 'mrs', 'dr', 'ms', 'fig', 'u.s.a', 'a.d', 'a.m',
        'cap', 'cf', 'cp', 'c.v', 'al', 'etc', 'e.g', 'ff', 'id', 'i.a', 'i.e',
        'lb', 'll.b', 'm.a', 'n.b', 'op.cit', 'p.a', 'ph.d', 'p.m', 'p.p',
        'prn', 'pro tem', 'p.s', 'q.d', 'q.e.d', 'q.v', 're', 'reg', 'r.i.p',
        's.o.s', 'stat', 'vis', 'vs', 'et al', 'et.al', 'etc', 'e.g', 'i.e',
        'eq', 'a.e', 'a.e', 'cf', 'con', 'const', 'fig', 's.t', 'st', '(', ')',
        '?('
    ]
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)
    sentences = tokenizer.tokenize(text.lower())
    return replace_specieal_characters(sentences)
Example #33
def bill_sent_chunk_tokenize(bill_text, min_sentence_length=20):

    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', '1', '2', '3', '4', '5', '6',
        '7', '8', '9'
    ])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)

    bill_sentences = sentence_splitter.tokenize(bill_text)

    bill_sentences = [
        s for s in bill_sentences if len(s) > min_sentence_length
    ]

    return bill_sentences
def getSentences(paragraph):

    unicode_data = paragraph.decode("utf-8")
    data = "".join([i if ord(i) < 128 else "" for i in unicode_data])

    ##tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    punkt_params = PunktParameters()
    punkt_params.abbrev_types = set(
        ['al', "inc", "mr", "dr", "mrs", "prof", "etal"])
    splitter = PunktSentenceTokenizer(punkt_params)

    sentences = splitter.tokenize(data)

    sentences1 = filter_list(sentences)
    ##print sentences1,"\n----------------------------------------------------------------------------"
    return sentences1
 def __init__(self):
     self.result = []
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(['г', 'гор', 'ул', 'кв', 'д', 'корп', 'эт', 'стр', 'пер', 'просп', 'тел', 'зам', 'каб', 'гос', 'мед'])
     self.sentence_splitter = PunktSentenceTokenizer(punkt_param)
     self.command = "./mystem -dig --eng-gr --format json < input.txt > mystem.json"
     self.verbs = [] #[u"отметил", u"сказал", u"подчеркнул", u"сообщил"]
     self.auxV_Author_reverse = [] #[u"[Пп]о .... словам", u"[Пп]о данным", u"[Пп]о сообщению", ]
     self.dividers = {}
     self.load_verbs()
     self.dividersF()  # build the dividers that separate quoted speech from its author
     self.dividersREG = []
     self.authID = 1
     self.mystem_authors = u""
     self.authors = {}
     self.start = time.clock()
Example #36
def plagiarismChecker():
    text = request.form['text_to_check']
    if (text.lstrip().rstrip() == ''):
        return render_template('input.html')
    punkt_parameters = PunktParameters()
    sentence_splitter = PunktSentenceTokenizer(punkt_parameters)
    sentences = sentence_splitter.tokenize(text)
    probability_of_plagiarism = 0
    for a_sentence in sentences:
        time.sleep(0.1)
        content = list(filter(lambda x: x in string.printable, a_sentence))
        str1 = ''.join(content)
        print(str1)
        # temp=list(content)
        # print(str(temp))
        the_term = urllib.parse.quote('+' + '"' + str1 + '"')
        page = requests.get('https://www.bing.com/search?q=' + the_term)
        print(page.url)
        if ((not "There are no results for" in page.text)
                and (not "No hay resultados para" in page.text)
                and (not "are no results for" in page.text)):
            probability_of_plagiarism += 1
    percent_plagiarised = str(
        (probability_of_plagiarism / len(sentences)) * 100) + '%'
    return render_template('results.html',
                           text=text,
                           percent_plagiarised=percent_plagiarised)
Example #37
def filtered_sentences(article, debug=False):
    #get sentences from the article
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(article.strip())
    #tokenize all of the sentences
    sentences = [(nltk.word_tokenize(sentence), sentence) for sentence in sentences]
    #throw out sentences with no linking verb
    sentences = filter(short_enough, sentences)
    sentences = [s for s in sentences if next(all_linking(s), None)]
    #pos tag the remaining sentences
    sentences = [sentence_to_features(sentence) for sentence in sentences]
    #filter(good_enough, sentences)
    sentences = sorted(sentences, key=goodness)
    return [sentence[1] for sentence in sentences]
Example #38
def sentence_tokenizer(text):
    """
    Tokenizes sentences.

    :param text:
    :return: list of sentences (a sentence is a string)
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = {
        'zzgl', 'prof', 'ca', 'vj', 't', 'mio', 'sro', 'lv', 'io', 'ihv',
        'bzw', 'usw', 'inkl', 'zt', 'vh', 'dr', 'entspr', 'dem', 'fort', 'co',
        'kg', 'zb', 'bspw', 'ua', 'rd', 'abs', 'etc', 'tsd', 'z.b', 'evtl',
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '19', '20', '21'
    }
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
def split_into_sentences(text):
    # splits the text into sentences and also preserves the corresponding starting and ending indices
    startIndices = []
    endIndices = []
    corpus = []
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'doc', 'mr', 'mrs', 'prof', 'inc', 'mgr', 'ing', 'st'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)

    for start, end in sentence_splitter.span_tokenize(text):
        startIndices.append(start)
        endIndices.append(end)
        token = text[start:end]
        corpus.append(token)
    return startIndices, endIndices, corpus
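A hypothetical usage of split_into_sentences: because span_tokenize is used, each returned sentence can be mapped back to its character offsets in the original text (the sample string is invented).
# Hypothetical usage of split_into_sentences() defined above.
text = 'Dr. Novak arrived late. The meeting started without him.'
starts, ends, corpus = split_into_sentences(text)
for start, end, sentence in zip(starts, ends, corpus):
    assert text[start:end] == sentence
    print(start, end, sentence)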
Example #40
def myNLTKParser(document, tagger):
    lexical_diversity = len(document) / len(set(document)) * 1.0

    punkt_param = PunktParameters()
    # if any customized abbrev
    #punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])

    # tokenize to sentence
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(document.replace('\'s', '_s'))

    # tokenize sentence to words
    word_tokens = [[
        w.strip() for w in nltk.word_tokenize(s)
        if not w.strip().lower() in stopwords
    ] for s in sentences]

    # extend token to bigram and trigram
    extended_tokens = []
    for token_list in word_tokens:
        extended_tokens.append(token_list + list(nltk.bigrams(token_list)) +
                               list(nltk.trigrams(token_list)))

    # word stemmer to normalize
    p_stemmer = PorterStemmer()
    stem_tokens = []
    for token_list in word_tokens:
        stem_tokens.append([p_stemmer.stem(w) for w in token_list])

    # POS tags
    tags = [tagger.tag(a) for a in extended_tokens]

    tags_of_verbs = ['NN', 'VB', 'VBP', 'VBG']
    tags_of_interest = [
        'JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNPS', 'NNS', 'RB', 'RBR', 'RBS'
    ]
    tags_of_noun = ['NN']
    merged_tags_uni = [
        word for sublist in tags for (word, tag) in sublist
        if tag in tags_of_verbs and isinstance(word, tuple) == False
    ]
    merged_tags_bi = [
        word for sublist in tags for (word, tag) in sublist if
        tag in tags_of_interest and isinstance(word, tuple) and len(word) == 2
    ]
    merged_tags_tri = [
        word for sublist in tags for (word, tag) in sublist if
        tag in tags_of_interest and isinstance(word, tuple) and len(word) == 3
    ]

    uni_tags_fd = nltk.FreqDist(merged_tags_uni)
    bi_tags_fd = nltk.FreqDist(merged_tags_bi)
    tri_tags_fd = nltk.FreqDist(merged_tags_tri)

    return {
        'uni_fd': uni_tags_fd.max(),
        'bi_fd': bi_tags_fd.max(),
        'tri_fd': tri_tags_fd.max(),
    }
Example #41
def get_important_sent(html_content):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'miss', 'prof', 'inc', 'no', 'cap', 'nos',
        'vol', 'para', 'exh'
    ])
    tokenizer = nltk.PunktSentenceTokenizer(punkt_param)
    soup = BeautifulSoup(html_content, 'html.parser')
    content = soup.get_text()
    paras = get_paras(content)
    sents = []
    for para in paras:
        para_content = content[para[0]:para[1] + 1]
        for sent in tokenizer.span_tokenize(para_content):
            sents.append(para_content[sent[0]:sent[1] + 1])
    sents = np.array(sents)
    BertTokenizer = bert.bert_tokenization.FullTokenizer(VOCAB_FILE,
                                                         do_lower_case=True)
    input_ids, input_mask, segment_ids = convert_all_sentences(
        clean_data(sents), BertTokenizer)
    model = tf.keras.models.load_model("bert_model")
    input_X = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids
    }
    sents = sents[(model.predict(input_X, batch_size=1) > 0.4).reshape(-1, )]
    for sent in sents:
        segs = filter(lambda seg: seg != "", sent.split("\n"))
        for seg in segs:
            seg = seg.replace("\xa0", "&nbsp;")
            while seg:
                cur = len(seg)
                while True:
                    if not cur:
                        return html_content
                    cur_str = seg[:cur]
                    res = html_content.find(cur_str)
                    if res == -1:
                        cur -= 1
                    else:
                        html_content = add_important_class(
                            html_content, res, res + len(cur_str))
                        seg = seg[cur:]
                        break
    return html_content
Example #42
    def tokenize_sentences(self, text, word_threshold=5):
        """
        Returns a list of sentences given an input string of text.

        :param text: input string
        :param word_threshold: number of significant words that a sentence must contain to be counted
        (to count all sentences set equal to 1; 5 by default)
        :return: list of sentences
        """
        punkt_params = PunktParameters()
        # Not using set literal to allow compatibility with Python 2.6
        punkt_params.abbrev_types = set([
            'dr', 'vs', 'mr', 'mrs', 'ms', 'prof', 'mt', 'inc', 'i.e', 'e.g'
        ])
        sentence_splitter = PunktSentenceTokenizer(punkt_params)

        # 1. TOKENIZE "UNPROCESSED" SENTENCES FOR DISPLAY
        # Need to adjust quotations for correct sentence splitting
        text_unprocessed = text.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

        # Treat line breaks as end of sentence (needed in cases where titles don't have a full stop)
        text_unprocessed = text_unprocessed.replace('\n', ' . ')

        # Perform sentence splitting
        unprocessed_sentences = sentence_splitter.tokenize(text_unprocessed)

        # Now that sentences have been split we can return them back to their normal formatting
        for ndx, sentence in enumerate(unprocessed_sentences):
            sentence = unicode_to_ascii(sentence)  # Sentence splitter returns unicode strings
            sentence = sentence.replace('? " ', '?" ').replace('! " ', '!" ').replace('. " ', '." ')
            sentence = self._remove_whitespace(sentence)  # Remove excess whitespace
            sentence = sentence[:-2] if (sentence.endswith(' .') or sentence.endswith(' . ')) else sentence
            unprocessed_sentences[ndx] = sentence

        # 2. PROCESS THE SENTENCES TO PERFORM STEMMING, STOPWORDS REMOVAL ETC. FOR MATRIX COMPUTATION
        processed_sentences = [self.sanitize_text(sen) for sen in unprocessed_sentences]

        # Sentences should contain at least 'word_threshold' significant terms
        filter_sentences = [i for i in range(len(processed_sentences))
                            if len(processed_sentences[i].replace('.', '').split(' ')) > word_threshold]

        processed_sentences = [processed_sentences[i] for i in filter_sentences]
        unprocessed_sentences = [unprocessed_sentences[i] for i in filter_sentences]

        return processed_sentences, unprocessed_sentences
Example #43
def sent_tokenize(text, abbrev_list=['dr', 'vs',
                                     'etc', 'mr',
                                     'mrs', 'prof',
                                     'inc', 'et',
                                     'al', 'Fig', 'fig']):
    '''
    Tokenizes a string into sentences

    Args:
        text(str) -- The text being tokenized
        abbrev_list(list) -- a list of abbreviations followed by dot
            to exclude from tokinzation e.g. mr. ms. etc.

    Returns
        list of strings -- list of sentences
    '''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(abbrev_list)
    sent_detector = PunktSentenceTokenizer(punkt_param)
    return sent_detector.tokenize(text)
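A short usage sketch (the sentence is invented) showing the abbrev_list override.
# Hypothetical usage of sent_tokenize() defined above.
print(sent_tokenize('Dr. Lee measured the sample twice. See Fig. 2 for details.',
                    abbrev_list=['dr', 'fig']))
# 'dr' and 'fig' keep Punkt from breaking after "Dr." and "Fig."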
Example #44
def sent_tokenize(data, filter_threshold=None):
    '''
    Tokenizes a string into sentences and corresponding offsets

    Args:
        data(str): The document itself
        filter_threshold(int): if sentence length is
            less than this, it will be ignored

    Returns:
        tuple(list(str), list(list))): tokenized
            sentences and corresponding offsets
    '''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = sent_detector.span_tokenize(data)
    return (sentences, offsets)
def ari(raw):

    # tokenize raw text and get words
    tokens = nltk.wordpunct_tokenize(raw)
    words = [word.lower() for word in tokens if word.isalpha()]

    # instantiate punctuation parameters
    punkt_params = PunktParameters()
    # specify abbreviations to be ignored in sentence separation
    punkt_params.abbrev_types = set(['dr', 'inc', 'mr', 'mrs', 'ms', 'prof',
                                     'etc'])
    # separate into sentences using a PuktSentenceTokenizer
    sentences = PunktSentenceTokenizer(punkt_params).tokenize(raw)

    chars = 0

    for word in words:
        chars += len(word)
    
    return (4.71 * (chars / len(words)) + 0.5 * (len(words) / len(sentences))
            - 21.43)
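A worked usage of ari() on an invented two-sentence passage; the exact score depends on the character and word counts fed into the formula.
# Hypothetical usage of ari() defined above.
sample = 'Mr. Smith wrote a short note. It was easy to read and understand.'
print(round(ari(sample), 2))
# ARI = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43,
# so short words and short sentences push the grade level down.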
Example #46
def summarize(body, pmid):
    if not body:
        return('No summary available for PMID %d' % pmid)

    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['et al', 'i.e', 'e.g', 'ref', 'c.f',
                                    'fig', 'Fig', 'Eq', 'eq', 'eqn', 'Eqn'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(body)
    
    tagged = []
    for sentence in sentences:
        tagged.append(bigram_tagger.tag(sentence.split()))
    
    summary = []
    for sentence in tagged:
        for (word, tag) in sentence:
            if tag == 'PPSS' and word.lower() == 'we':
                summary.append(' '.join(nltk.tag.untag(sentence)))

    return summary
Example #47
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 16 09:41:46 2016

@author: U505118
"""

import pandas as pd
import re
import unidecode
import nltk


from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.punkt import PunktParameters
punkt = PunktParameters()
punkt.abbrev_types = ['u.s.a', 'ltd', 'inc', 'no']
sen = PunktSentenceTokenizer(punkt)

for k in range(21, 87, 3):
    df = pd.read_csv('C:/Users/U505118/Desktop/P/10_K/outt'+str(k)+'01.csv')
    
    textl = []
    i = 0
    index = []
    
    for x in df['fact']:
        if type(x) is str:
            if len(x) > 100:
                #x = unidecode.unidecode(x)
                x = re.sub(r'<.*?>', ' ', x)
Author: Michael J Bommarito II <*****@*****.**>
Date: 2014-05-24
"""

# NLTK imports
from nltk.corpus import stopwords
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktWordTokenizer, \
    PunktParameters

# Import stopword list
english_stopwords = stopwords.words('english')


# Customizer sentence tokenizer
punkt_param = PunktParameters()
punkt_param.abbrev_types = [x.lower().strip() for x in \
                                set(['id', 'al', 'mr', 'mrs',
                                         'prof', 'inc', 'llc', 'co',
                                         'llp', 'pp', 'f', 'app',
                                         '2d', '3d', 'ch', 's', 'us',
                                         'cert', 'rev', 'i', 'ii', 'iii',
                                         'a', 'b', 'c', 'd', 'e', 'f',
                                         'g', 'h', 'i', 'j', 'k', 'l',
                                         'm', 'n', 'o', 'p', 'q', 'r', 's',
                                         't', 'u', 'v', 'w', 'x', 'y', 'z',
                                         ')', 'no', 'cir', 'ca', 'c.a',
                                         'fed', 'sec', 
                                         'jan', 'feb', 'mar', 'apr', 'jun',
                                         'jul', 'aug', 'sep', 'oct', 'nov',
                                         'dec', 'ala', 'vt', 'st', 'u.s', 
Example #49
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer

# Would like to have this search through a CLTK_DATA environment variable
# Better to use something like make_cltk_path in cltk.utils.file_operations?
home = os.path.expanduser('~')
cltk_path = os.path.join(home, 'cltk_data')
if not os.path.isdir(cltk_path):
    os.makedirs(cltk_path)

word_tokenizer = WordTokenizer('latin')

if os.path.exists(cltk_path + 'latin/model/latin_models_cltk/tokenizers/sentence'):
    sent_tokenizer = TokenizeSentence('latin')
else:
    punkt_param = PunktParameters()
    abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'", 'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul', 'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop']
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

# Latin Library
try:
    latinlibrary = PlaintextCorpusReader(cltk_path + '/latin/text/latin_text_latin_library', 
    '.*\.txt',
    word_tokenizer=word_tokenizer, 
    sent_tokenizer=sent_tokenizer, 
    encoding='utf-8')    
    pass
except IOError as e:
    pass
    # print("Corpus not found. Please check that the Latin Library is installed in CLTK_DATA.")
Example #50
def tokenize_latin_words(string):
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word

        return replace

    replacements = [(r'mecum', 'cum me'),
                    (r'tecum', 'cum te'),
                    (r'secum', 'cum se'),
                    (r'nobiscum', 'cum nobis'),
                    (r'vobiscum', 'cum vobis'),
                    (r'quocum', 'cum quo'),
                    (r'quacum', 'cum qua'),
                    (r'quicum', 'cum qui'),
                    (r'quibuscum', 'cum quibus'),
                    (r'sodes', 'si audes'),
                    (r'satin', 'satis ne'),
                    (r'scin', 'scis ne'),
                    (r'sultis', 'si vultis'),
                    (r'similist', 'similis est'),
                    (r'qualist', 'qualis est')
                    ]

    for replacement in replacements:
        string = re.sub(replacement[0], matchcase(replacement[1]), string, flags=re.IGNORECASE)


    punkt_param = PunktParameters()
    abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'", 'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul', 'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop']
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    word_tokenizer = PunktLanguageVars()
    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []

    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        if temp_tokens[0].endswith('ne'):
            if temp_tokens[0].lower() not in exceptions:
                temp = [temp_tokens[0][:-2], '-ne']
                temp_tokens = temp + temp_tokens[1:]

        if temp_tokens[-1].endswith('.'):
            final_word = temp_tokens[-1][:-1]
            del temp_tokens[-1]
            temp_tokens += [final_word, '.']

        for token in temp_tokens:
            tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []

    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
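A hypothetical usage of tokenize_latin_words (it needs cltk's Latin exception list to be importable); the sample phrase is invented.
# Hypothetical usage of tokenize_latin_words() defined above.
print(tokenize_latin_words('Arma virumque cano.'))
# The enclitic pass should split "virumque" into "virum" and "-que",
# and the final period is detached from "cano".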
Example #51
def tokenize_latin_words(string):
    """
    Tokenizer divides the string into a list of substrings
  
    >>> from cltk.corpus.utils.formatter import remove_non_ascii
    >>> text =  'Dices ἐστιν ἐμός pulchrum esse inimicos ulcisci.'
    >>> remove_non_ascii(text)
    'Dices   pulchrum esse inimicos ulcisci.'
  
    :param string: This accepts the string value that needs to be tokenized
    :returns: A list of substrings extracted from the string
    """
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word

        return replace

    replacements = [(r'mecum', 'cum me'),
                    (r'tecum', 'cum te'),
                    (r'secum', 'cum se'),
                    (r'nobiscum', 'cum nobis'),
                    (r'vobiscum', 'cum vobis'),
                    (r'quocum', 'cum quo'),
                    (r'quacum', 'cum qua'),
                    (r'quicum', 'cum qui'),
                    (r'quibuscum', 'cum quibus'),
                    (r'sodes', 'si audes'),
                    (r'satin', 'satis ne'),
                    (r'scin', 'scis ne'),
                    (r'sultis', 'si vultis'),
                    (r'similist', 'similis est'),
                    (r'qualist', 'qualis est')
                    ]

    for replacement in replacements:
        string = re.sub(replacement[0], matchcase(replacement[1]), string, flags=re.IGNORECASE)


    punkt_param = PunktParameters()
    abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'", 'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul', 'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop']
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    word_tokenizer = PunktLanguageVars()
    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []

    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        # Need to check that tokens exist before handling them; needed to make stream.readlines work in PlaintextCorpusReader
        
        if temp_tokens:
            if temp_tokens[0].endswith('ne'):
                if temp_tokens[0].lower() not in exceptions:
                    temp = [temp_tokens[0][:-2], '-ne']
                    temp_tokens = temp + temp_tokens[1:]

            if temp_tokens[-1].endswith('.'):
                final_word = temp_tokens[-1][:-1]
                del temp_tokens[-1]
                temp_tokens += [final_word, '.']

            for token in temp_tokens:
                tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []

    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
Example #52
def sent_tokenize(data):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = list(sent_detector.span_tokenize(data))
    new_sentences = deepcopy(sentences)
    new_offsets = deepcopy(offsets)
    for i, off in enumerate(offsets):
        if len(tokenizer.tokenize(sentences[i])) < 7:  # Skip short sentences
            pass
        else:
            if i < len(offsets) - 1:
                if ((offsets[i + 1][0] - offsets[i][1]) < 5):
                    new_sentences.append(sentences[i] + ' ' + sentences[i + 1])
                    new_offsets.append((offsets[i][0], offsets[i + 1][1]))
            if i < len(offsets) - 2:
                if ((offsets[i + 2][0] - offsets[i + 1][1]) < 5) and\
                        ((offsets[i + 1][0] - offsets[i][0]) < 5):
                    new_sentences.append(
                        sentences[i] + ' ' + sentences[i + 1] + ' ' + sentences[i + 2])
                    new_offsets.append((offsets[i][0], offsets[i + 2][1]))
    #         if i < len(offsets) - 3:
    #             if (((offsets[i + 3][0] - offsets[i + 2][1]) < 5) and
    #                     ((offsets[i + 2][0] - offsets[i + 1][0]) < 5) and
    #                     ((offsets[i + 1][0] - offsets[i][0]) < 5)):
    #                 new_sentences.append(sentences[i] + ' ' + sentences[i + 1] + ' ' + sentences[i + 2] + ' ' + sentences[i + 3])
    #                 new_offsets.append((offsets[i][0], offsets[i + 3][1]))
    #         if i < len(offsets) - 4:
    #             if (((offsets[i + 4][0] - offsets[i + 3][1]) < 5) and
    #                     ((offsets[i + 3][0] - offsets[i + 2][1]) < 5) and
    #                     ((offsets[i + 2][0] - offsets[i + 1][0]) < 5) and
    #                     ((offsets[i + 1][0] - offsets[i][0]) < 5)):
    #                 new_sentences.append(sentences[i] + ' ' + sentences[i + 1] + ' ' + sentences[i + 2] + ' ' + sentences[i + 3] + ' ' + sentences[i + 3])
    #                 new_offsets.append((offsets[i][0], offsets[i + 3][1]))
    #         if i < len(offsets) - 5:
    #             if (((offsets[i + 5][0] - offsets[i + 4][1]) < 5) and
    #                     ((offsets[i + 4][0] - offsets[i + 3][1]) < 5) and
    #                     ((offsets[i + 3][0] - offsets[i + 2][1]) < 5) and
    #                     ((offsets[i + 2][0] - offsets[i + 1][0]) < 5) and
    #                     ((offsets[i + 1][0] - offsets[i][0]) < 5)):
    #                 new_sentences.append(sentences[i] + ' ' + sentences[i + 1] + ' ' + sentences[i + 2] + ' ' + sentences[i + 3] + ' ' + sentences[i + 3])
    #                 new_offsets.append((offsets[i][0], offsets[i + 3][1]))
    print(new_offsets)
    return {'sentences': new_sentences, 'offsets': new_offsets}
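
The merging heuristic above only considers sentences of at least seven word tokens whose neighbours start within five characters of where they end. Below is a minimal usage sketch, assuming a module-level word `tokenizer` (here NLTK's TreebankWordTokenizer) and some made-up sample text:

from nltk.tokenize.treebank import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()  # assumed module-level word tokenizer used by sent_tokenize above

sample = ("Dr. Smith presented the segmentation results at the workshop. "
          "The audience asked about Fig. 3 and the evaluation protocol. "
          "A longer report is planned.")
result = sent_tokenize(sample)
for sent, (start, end) in zip(result['sentences'], result['offsets']):
    print(start, end, sent)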
Exemple #53
0
def sent_tokenize(data, filter_short=False, filter_verbless=False):
    """
    Tokenize sentences.

    Tokenize `data` into sentences and their character offsets.
    Returns a dict with the keys 'sentences' and 'offsets'.
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    # materialise the span generator so it can be indexed below
    offsets = list(sent_detector.span_tokenize(data))
    new_sentences = []
    new_offsets = []
    to_del = []
    if filter_verbless:
        # `pos_tagger` and `verbs` are module-level objects not shown in this excerpt
        pos = pos_tagger.extract_nlp_batch()
        for i in range(len(sentences)):
            okay = False
            for word in pos['sentences'][i]['words']:
                if word[1]['PartOfSpeech'] in verbs:
                    okay = True
                    break
            if not okay:  # the sentence has no verb,
                to_del.append(i)  # so mark it for deletion
        sentences = multi_delete(sentences, to_del)
        offsets = multi_delete(offsets, to_del)
    if filter_short and not filter_verbless:
        for i in range(len(sentences)):
            if len(sentences[i]) >= filter_short:
                new_sentences.append(sentences[i])
                new_offsets.append(offsets[i])
    if not filter_short:
        # no length filter requested; return the (possibly verb-filtered) sentences unchanged
        new_sentences, new_offsets = list(sentences), list(offsets)


    print(new_offsets)
    return {'sentences': new_sentences, 'offsets': new_offsets}
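
The function above leans on a `multi_delete` helper that the excerpt does not include; a minimal sketch consistent with how it is called here (drop the items at the given indices, return the rest) might be:

def multi_delete(items, indices):
    # remove the elements at the positions listed in `indices`, preserving order
    to_drop = set(indices)
    return [item for i, item in enumerate(items) if i not in to_drop]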
Exemple #54
0
import datetime
import json
import os
import re
import sys

import MySQLdb
from textblob import TextBlob


def gather_input():

    #gather input
    for file in os.listdir("../scrapper/"):
        if file.endswith(".txt"):
            inputFile = file
    file = open("../scrapper/"+inputFile,"r")
    input = file.read()
    file.close()

    #os.remove("../scrapper/"+inputFile)

    #extract text
    reg_string = r"\"text\":\"(.+?)[^\\]\""
    data_array = re.findall(reg_string, input)

    #extract location of tweet
    reg_string = r"\"location\":\"(.*?)\""
    location_array = re.findall(reg_string, input)

    #extract whether retweeted or not
    reg_string = r"\"retweeted\":(.+?),"
    retweet_bool = re.findall(reg_string, input)
    
    #today's date in YYYYMMDD format
    date = datetime.datetime.now()
    date = date.date()
    #date = date.strftime("%Y%m%d")

    ## calculate the barrier date
    date_diff = int(sys.argv[1])
    DD = datetime.timedelta(days=date_diff)
    barrier_date = (datetime.datetime.now()- DD).date()

    ## load the whitelist and create array of arrays as - [noun,sentiment,count]
    file = open("../py_code/white_list.txt","r")
    white_list = []
    line = file.readline()
    while line:
        white_list.append([line.rstrip(),0,0])
        line = file.readline()
    file.close()

    ## create a sentence_tokenizer
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20'])
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)
    
    ## next step is to inject into the database
    db = MySQLdb.connect(host="localhost",user="******",passwd="{2qGq(22+5iU",db="Insights")
    cur = db.cursor()
    
    ##filter out those tweet which have prices in them - usually sales, or retweets
    i=0
    for text in data_array:
        if retweet_bool[i]!="false":
            pass
        else:        
            ## many users don't put a space after a full stop, which the sentence tokenizer needs, so insert one
            data_array[i] = re.sub(r'([\.\?\!])(\w)', r'\1 \2', data_array[i])
            
            blob = TextBlob(data_array[i])
            blob_sentiment = int(blob.sentiment.polarity*1000)/1000.0
            sql = "INSERT INTO Phrases(Phrase,Sentiment,Location,Date) VALUES (\""+data_array[i]+"\", "+str(blob_sentiment)+", \""+location_array[i]+"\", \""+str(date)+"\")"
            cur.execute(sql)

            ## tokenize the tweets, for sentiment analysis
            sentences = sent_tokenizer.tokenize(data_array[i])

            if len(sentences) == 1:
                ##run through the whiteList array, for each find count, add count, sentiment to array
                for word in white_list:
                    if((sentences[0].lower()).find(word[0])!=-1):
                        word[1]=word[1]+blob_sentiment
                        word[2]=word[2]+1

                        
            else:
                for sentence in sentences:
                    ##run through the whiteList array, for each find count and sentiment, add count, sentiment to array
                    for word in white_list:
                        if((sentence.lower()).find(word[0])!=-1):
                            blob = TextBlob(sentence)
                            word[1]=word[1]+int(blob.sentiment.polarity*1000)/1000.0
                            word[2]=word[2]+1
                            
                           
        i=i+1
    db.commit()

    ### now integrate these into the Sentiment db; if there is no entry for today, insert the phrase and create one
    sql = "SELECT * FROM Sentiment WHERE `Date` ='"+str(date)+"' LIMIT 1;"
    cur.execute(sql)
    if(cur.rowcount==0):
        for word in white_list:
            if(word[2]!=0):
                sql = "INSERT INTO Sentiment VALUES ('"+str(date)+"','"+word[0]+"','"+str(word[1])+"','"+str(word[2])+"');"
                cur.execute(sql)
                
    ### else get the entry in the table, add sentiment and count, store back
    else:
        for word in white_list:
            if(word[2]!=0):
                sql = "SELECT Sentiment,Count FROM Sentiment WHERE `Date` ='"+str(date)+"'AND `Phrase`='"+word[0]+"';"
                cur.execute(sql)
                for row in cur.fetchall():
                    new_sentiment = float(row[0])+word[1]
                    new_count = row[1]+word[2]
                sql = "UPDATE Sentiment SET `Sentiment`="+str(new_sentiment)+",`Count`="+str(new_count)+" WHERE `Date` ='"+str(date)+"'AND `Phrase`='"+word[0]+"';"
                cur.execute(sql)
    db.commit()
    
    ### now add all the sentiment and count for all phrases in the white list in the Sentiment db above the barrier_date, add to json those whose count is not zero
    total_sentiment = 0
    total_count = 0
    json_array = []
    for word in white_list:
        sql = "SELECT Sentiment,Count FROM Sentiment WHERE `Date` >'"+str(barrier_date)+"'AND `Phrase`='"+word[0]+"';"
        cur.execute(sql)
        if(cur.rowcount!=0):
            for row in cur.fetchall():
                total_sentiment = total_sentiment+float(row[0])
                total_count = total_count+int(row[1])
            json_array.append({"noun": word[0], "sentiment": int(total_sentiment/total_count*1000)/1000.0, "count": total_count})
            total_sentiment = 0
            total_count = 0
    
    db.close()   
    print(json.dumps(json_array))
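
The INSERT statements above splice tweet text straight into the SQL string, which breaks as soon as a tweet contains a double quote. A hedged sketch of the first insert rewritten with MySQLdb's parameter binding (table and column names unchanged):

sql = "INSERT INTO Phrases(Phrase, Sentiment, Location, Date) VALUES (%s, %s, %s, %s)"
cur.execute(sql, (data_array[i], blob_sentiment, location_array[i], str(date)))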
Exemple #55
0
#!/usr/bin/env python

"""TAGGING INSTRUCTIONS
   If you don't know what to tag it with, reject with 'n' or just give the generic 'yn' tag!
   But seriously, 'yn' is a generic tag, since pretty much everything can be a polar question.
   If you have any other tags, please don't put the 'yn' tag. Just use it if you don't want to
   reject a sentence but don't have an alternative tag.
   You can use multiple tags for one sentence! just separate them with spaces on the same line.
   If you see a shitty "sentence", just 'n' it. The punkt tokenizer sucks."""

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
import sys

punkt_param = PunktParameters()
punkt_param.abbrev_types = set(["dr", "vs", "mr", "mrs", "prof", "inc", "v"])
sentence_splitter = PunktSentenceTokenizer(punkt_param)

valid_tags = frozenset(["who", "what", "when", "where", "how", "why", "yn"])


def output_tags(f, sentence, tags):
    for tag in tags:
        f.write(sentence + "\n**" + tag.upper() + "**\n\n")
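
The docstring above lays out the input convention: one line of space-separated tags per sentence, 'n' to reject, 'yn' as the generic fallback. The interactive loop itself is not part of this excerpt, but a minimal sketch of validating such a line against `valid_tags` could look like:

def parse_tag_line(line):
    # split the operator's input; 'n' on its own means reject the sentence
    tags = line.strip().lower().split()
    if tags == ["n"]:
        return []
    return [t for t in tags if t in valid_tags]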


# cool shit stolen from http://stackoverflow.com/questions/14374181/moving-back-an-iteration-in-a-for-loop
def repeatable(it):
    buf, it = None, iter(it)
    while True:
        if buf is None:
            # the buffer is empty, send them the next elem
Exemple #56
0
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
punkt_param = PunktParameters()
punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
sentence_splitter = PunktSentenceTokenizer(punkt_param)
text = "is THAT what you mean, Mrs. Hussey?"
sentences = sentence_splitter.tokenize(text)
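
Because 'mrs' is in the abbreviation set, the period after 'Mrs.' is not treated as a sentence boundary and the text above comes back as a single sentence; printing the spans is an easy way to check:

for start, end in sentence_splitter.span_tokenize(text):
    print(start, end, text[start:end])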

Exemple #57
0
import nltk
from nltk.tree import Tree
import os.path
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from py4j.java_gateway import JavaGateway
from PreProcessing import parsers



#edit this when changing dirs
LangPaths = os.path.realpath("C:/users/rihanna/Documents/Pol/ThesisIt/SumMe/Summarizer/langdetector/profiles/")
tltagger = nltk.data.load("taggers/filipino_aubt.pickle") #filipino pos tagger

tlChunker = nltk.data.load("chunkers/filipino_ub.pickle")#filipino chunker here
enChunker = nltk.data.load("chunkers/conll2000_ub.pickle") #enChunkerhere


punkt_param = PunktParameters()  # container for the tokenizer parameters
punkt_param.abbrev_types = set(['gng', 'mr', 'mrs', 'dr', 'rep'])  # additional accepted abbreviations go here

sentence_splitter = PunktSentenceTokenizer(punkt_param)
tokenized = ""
gateway = JavaGateway()
detector = gateway.entry_point
detector.init(LangPaths)

def LangDetect(str):
    return detector.detect(str)

def tokenizer(str):
    
    #print(wordpunct_tokenize(str))
    return wordpunct_tokenize(str)
def getSplitter():
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = ABBREVS  # ABBREVS is a module-level abbreviation set not shown in this excerpt
    return PunktSentenceTokenizer(punkt_param)
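
`getSplitter` relies on a module-level `ABBREVS` constant that the excerpt never defines; a plausible (assumed) definition mirroring the abbreviation set configured earlier in the module, plus a quick usage check, might be:

ABBREVS = set(['gng', 'mr', 'mrs', 'dr', 'rep'])  # assumed to mirror the set configured above

splitter = getSplitter()
print(splitter.tokenize("Si Gng. Santos ay dumating kahapon. Nagsalita si Dr. Cruz sa programa."))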