Example #1
1
def parseTextToSentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
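A minimal, self-contained sketch (the sample text below is made up) of why the abbrev_types set above matters: without it, an untrained Punkt tokenizer may treat the period after "Dr." as a sentence boundary.

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

text = 'Dr. Smith saw the patient at 9 a.m. He left an hour later.'

default_splitter = PunktSentenceTokenizer()
custom_param = PunktParameters()
custom_param.abbrev_types = set(['dr'])
custom_splitter = PunktSentenceTokenizer(custom_param)

print(default_splitter.tokenize(text))  # may split after "Dr."
print(custom_splitter.tokenize(text))   # keeps "Dr. Smith ..." in one sentence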
Example #2
0
def get_todo_items(text):
    all_items = list()
    tokenizer = PunktSentenceTokenizer()
    sen_tokens = tokenizer.tokenize(text)

    for sen_token in sen_tokens:
        todo_items = list()
        tokens = nltk.word_tokenize(sen_token)

        tags = tagger.tag(tokens)
        stop_words = [word for (word, tag) in tags if tag in (tagVB, tagVBP)]

        ind = -1
        for word in stop_words:
            curr_ind = tokens.index(word)
            if curr_ind != 0 and tags[curr_ind - 1][1] in (tagCC, tagRB):
                to_ind = curr_ind - 1
            else: to_ind = curr_ind
            if ind != -1 and abs(to_ind - ind) > 1:
                todo_items.append(' '.join(tokens[ind:get_punctuation_index(tokens, ind, to_ind)]))
            elif ind != -1 and len(todo_items) > 0:
                last_ind = len(todo_items)
                todo_items[last_ind - 1] = ' '.join([todo_items[last_ind - 1], tokens[to_ind - 1]])
            ind = curr_ind

        if ind != -1 and abs(len(tokens) - ind) > 1:
            todo_items.append(' '.join(tokens[ind:get_punctuation_index(tokens, ind, len(tokens))]))
        elif ind != -1 and len(todo_items) > 0:
            last_ind = len(todo_items)
            todo_items[last_ind - 1] = ' '.join([todo_items[last_ind - 1], tokens[len(tokens) - 1]])

        all_items.extend(todo_items)

    return all_items
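The get_punctuation_index() helper is not shown in this example; a hypothetical sketch of what it might do (an assumption, not the project's code):

import string

def get_punctuation_index(tokens, start, end):
    # hypothetical: return the index of the first punctuation token in
    # tokens[start:end], or end if there is none
    for i in range(start, end):
        if tokens[i] in string.punctuation:
            return i
    return end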
Example #3
0
    def sentence_tokenizer(self, untokenized_string, language):
        """Reads language .pickle for right language"""
        if language == 'greek':
            pickle_path = os.path.expanduser('~/cltk_data/greek/cltk_linguistic_data/tokenizers/sentence/greek.pickle')
            language_punkt_vars = PunktLanguageVars
            language_punkt_vars.sent_end_chars = ('.', ';')
            language_punkt_vars.internal_punctuation = (',', '·')
        elif language == 'latin':
            pickle_path = os.path.expanduser('~/cltk_data/latin/cltk_linguistic_data/tokenizers/sentence/latin.pickle')
            language_punkt_vars = PunktLanguageVars
            language_punkt_vars.sent_end_chars = ('.', '?', ':')
            language_punkt_vars.internal_punctuation = (',', ';')
        else:
            # avoid a NameError below when the language is unsupported
            print("No sentence tokenizer for this language available.")
            return []

        with open(pickle_path, 'rb') as open_pickle:
            tokenizer = pickle.load(open_pickle)
        tokenizer.INCLUDE_ALL_COLLOCS = True
        tokenizer.INCLUDE_ABBREV_COLLOCS = True
        params = tokenizer.get_params()
        sbd = PunktSentenceTokenizer(params)
        tokenized_sentences = []
        for sentence in sbd.sentences_from_text(untokenized_string,
                                                realign_boundaries=True):
            tokenized_sentences.append(sentence)
        return tokenized_sentences
Example #4
0
    def get_key_sentences(self, n=5):
        '''
        Uses a simple implementation of TextRank to extract the top N sentences
        from a document.

        Sources:
        - Original paper: http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Mihalcea.pdf
        - Super useful blog post: http://joshbohde.com/blog/document-summarization
        - Wikipedia: http://en.wikipedia.org/wiki/Automatic_summarization#Unsupervised_keyphrase_extraction:_TextRank
        '''
        # Tokenize the document into sentences. More NLP preprocessing should also happen here.
        sentence_tokenizer = PunktSentenceTokenizer()
        sentences = sentence_tokenizer.tokenize(self.doc)

        # Calculate word counts and TFIDF vectors
        word_counts = CountVectorizer(min_df=0).fit_transform(sentences)
        normalized = TfidfTransformer().fit_transform(word_counts) 

        # Normalized graph * its transpose yields a sentence-level similarity matrix
        similarity_graph = normalized * normalized.T
     
        nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
        scores = nx.pagerank(nx_graph)
        return sorted(((scores[i], s) for i, s in enumerate(sentences)),
                      reverse=True)[:n]
Example #5
 def fractal_representation(self):
     punkt_param = PunktParameters()
     for each_paragraph in self.paragraphs:
         buffer_p = paragraph()
         buffer_p.paragraph = each_paragraph
         buffer_p.tokens = nltk.word_tokenize(preprocess(each_paragraph))
         buffer_p.weights['words'] = FreqDist(buffer_p.tokens)
         buffer_p.weights['total'] = {'words':0, 'sentences':0}    
         punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
         sentence_splitter = PunktSentenceTokenizer(punkt_param)
         sentences = sentence_splitter.tokenize(each_paragraph)
         for each_sentence in sentences:
             self.stotal += 1
             buffer_s = sentence()
             buffer_s.sentence = each_sentence
             buffer_s.tokens = nltk.word_tokenize(preprocess(each_sentence))
             if len(buffer_s.tokens) > 0:
                 buffer_s.weights['sentence'] = FreqDist(buffer_s.tokens)
                 buffer_s.weights['paragraph'] = self.calculate_relative_frequence(buffer_s.tokens, buffer_p.weights['words'])
                 buffer_s.weights['document'] = self.calculate_relative_frequence(buffer_s.tokens, self.fractal.weights)
                 buffer_s.weights['total'] = {}
                 buffer_s.weights['total']['sentence'] = 1
                 buffer_s.weights['total']['paragraph'] = sum(buffer_s.weights['paragraph'].values())
                 buffer_s.weights['total']['document'] = sum(buffer_s.weights['document'].values())
                 self.s_weight += buffer_s.weights['total']['document']
                 buffer_p.weights['total']['sentences'] += buffer_s.weights['total']['document']
                 buffer_p.sentences.append(buffer_s)
         buffer_p.weights['total']['words'] = sum(buffer_p.weights['words'].values())
         self.fractal.paragraphs.append(buffer_p)
         self.pindex += 1
Example #6
0
def textrank(document):
    pst = PunktSentenceTokenizer()
    sentences = pst.tokenize(document)

    # Bag of Words
    from sklearn.feature_extraction.text import CountVectorizer
    cv = CountVectorizer()
    bow_matrix = cv.fit_transform(sentences)

    from sklearn.feature_extraction.text import TfidfTransformer
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)

    ## mirrored matrix where the rows and columns correspond to 
    ## sentences, and the elements describe how similar the
    ## sentences are. score 1 means sentences are exactly the same.
    similarity_graph = normalized_matrix * normalized_matrix.T
    similarity_graph.toarray()

    # PageRank
    import networkx as nx
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)

    ## mapping of sentence indices to scores. use them to associate
    ## back to the original sentences and sort them
    scores = nx.pagerank(nx_graph)
    ranked = sorted(((scores[i], s) for i,s in enumerate(sentences)), reverse=True)
    print(ranked[0][1])
Example #7
def featureize(F, observation_files):

    word_tokenizer = PunktSentenceTokenizer()
    sent_tokenizer = PunktSentenceTokenizer()

    m = len(observation_files)

    # X is Nx2
    X = np.zeros((m,2), dtype=np.float)

    for (i,filename) in enumerate(observation_files,start=0):

        file_text  = read_file(filename).decode('string_escape')

        try:
            num_sents = len(sent_tokenizer.sentences_from_text(file_text))
        except UnicodeDecodeError:
            num_sents = 2

        #num_tokens = len(word_tokenize(file_text))
        num_tokens = len(file_text.split())

        # Return two features: 
        # 1 (0) - Number of sentences per file
        # 2 (1) - Number of tokens per file
        X[i][0] = num_sents
        X[i][1] = num_tokens

    return X
Example #8
 def summarize(self):
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
     sentence_splitter = PunktSentenceTokenizer(punkt_param)
     sentences = sentence_splitter.tokenize(self.text)
     structure = {}
     sentence_objects = []
     for idx in range(len(sentences)):
         obj = {'text' : sentences[idx], 'index' : idx , 'data': {}}
         sentence_objects.append(obj)
     structure['sentences'] = sentence_objects
     self.sentencecount = len(structure['sentences'])
     structure['ordered'] = []
     structure['weights'] = {'words' : FreqDist(nltk.word_tokenize(preprocess(self.text))), 'total': 0, 'transformed': 0}
     structure['weights']['total'] = sum(structure['weights']['words'].values())
     self.sentenceIndex = 0
     for each_sent in structure['sentences']:
         each_sent['data']['tokens'] = nltk.word_tokenize(preprocess(each_sent['text']))
         each_sent['data']['sinTransform'] = (1-math.sin(self.sentenceIndex*(math.pi/self.sentencecount)))+1
         for each_word in structure['weights']['words']:
             if each_word in each_sent['data']['tokens']:
                 structure['weights']['words'][each_word] *= each_sent['data']['sinTransform']
         self.sentenceIndex += 1
     structure['weights']['transformed'] = sum(structure['weights']['words'].values())
     self.sentenceIndex = 0
     for each_sent in structure['sentences']:
         each_sent['data']['weights'] = {'words': self.calculate_relative_frequence(each_sent['data']['tokens'], structure['weights']['words']), 'total': 0}
         each_sent['data']['weights']['total'] = sum(each_sent['data']['weights']['words'].values())
         self.sentenceIndex += 1
     structure['ordered'] = sorted(structure['sentences'], key=lambda x:x['data']['weights']['total'], reverse=True)
     structure_keep = structure['ordered'][:self.quota]
     structure_keep.sort(key=lambda x:x['index'])
     for eac_sen in structure_keep:
         self.summary.append(eac_sen['text'])
Example #9
0
    def tokenize_sentences(self, untokenized_string: str):
        """Tokenize sentences by reading trained tokenizer and invoking
        ``PunktSentenceTokenizer()``.
        :type untokenized_string: str
        :param untokenized_string: A string containing one or more sentences.
        :rtype : list of strings
        """
        # load tokenizer
        assert isinstance(untokenized_string, str), \
            'Incoming argument must be a string.'

        if self.language == 'latin':
            tokenizer = super()
        elif self.language == 'greek': # Workaround for regex tokenizer
            self.sent_end_chars=GreekLanguageVars.sent_end_chars
            self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
            self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
        elif self.language in INDIAN_LANGUAGES:
            self.sent_end_chars=SanskritLanguageVars.sent_end_chars
            self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
            self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
        else:
            # Warn that NLTK Punkt is being used by default???
            tokenizer = PunktSentenceTokenizer()

        # mk list of tokenized sentences
        if self.language == 'greek' or self.language in INDIAN_LANGUAGES:
            return re.split(self.pattern, untokenized_string)
        else:
            return tokenizer.tokenize(untokenized_string)
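A self-contained illustration of the regex fallback used above for Greek and the Indian languages: split on whitespace that follows one of the declared sentence-ending characters (the sample text and the sent_end_chars tuple below are made up for illustration).

import re

sent_end_chars = ('.', ';')  # e.g. what GreekLanguageVars.sent_end_chars might contain
pattern = rf"(?<=[{'|'.join(sent_end_chars)}])\s"
print(re.split(pattern, 'First clause; second clause. Third clause.'))
# ['First clause;', 'second clause.', 'Third clause.']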
Example #10
0
def preprocess(phys):
    '''
    :param phys: raw document text (bytes)
    :return: a list of sentences, processed for searchability
    '''

    phys = phys.decode('utf-8')
    phys = re.sub('(\n)+', '. ', phys)

    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(phys)

    for i in xrange(len(sentences)):
        sentence = unicode(sentences[i])
        sentence = sentence.replace('\n', ' ')
        sentence = re.sub(' +',' ',sentence)
        sentence = re.sub(r'\d+', '', sentence)
        sentence = sentence.replace("-"," ")
        exclude = string.punctuation
        sentence = ''.join(ch for ch in sentence if ch not in exclude)
        sentence = re.sub(' +',' ',sentence)
        sentences[i] = sentence
        # sentences[i] = sentence.encode('utf-8')
    # drop empty sentences (popping from the list while iterating over it skips items)
    sentences = [s for s in sentences if s.strip()]

    # with open(fname.rstrip('txt')+'json', 'w') as outfile:
    #     json.dump(sentences, outfile)

    return sentences
Example #11
0
    def preprocessin(self, cell_value):

        # to tokenize the tweet into sentences
        tweet = PunktSentenceTokenizer().tokenize(cell_value)
        # to remove 'u'
        tweet = '\n'.join(tweet)
        # to remove html tags
        tweet = self.remTags(tweet)
        # to lowercase the text
        tweet = tweet.lower()

        ##Removing all junk
        tweet = re.sub(u'(RT |\\\\|\u201c)"?@.*?[: ]', ' ', tweet)
        tweet = re.sub('@', ' ', tweet)
        tweet = re.sub(r'[^\x00-\x7F]', ' ', tweet)
        tweet = re.sub('[\s]+', ' ', tweet)
        tweet = re.sub('_', ' ', tweet)
        tweet = re.sub('((www\.[\s]+)|(https?://[^\s]+))', '', tweet)
        tweet = re.sub(r'\\([^\s]+)', ' ', tweet)
        tweet = re.sub(u'[\u2018\u2019]', '\'', tweet)
        tweet = re.sub('(^|)?http?s?:?/?/?.*?( |$)', ' ', tweet)
        tweet = re.sub(u'\u2026', ' ', tweet)
        tweet = re.sub('---', ' ', tweet)
        tweet = re.sub(u'[\u201c\u201d]', '"', tweet)
        tweet = re.sub('\.?@.*?( |:|$)', ' ', tweet)
        tweet = re.sub(r"\.\.+", ' ', tweet)
        tweet = re.sub('&amp', ' ', tweet)
        tweet = re.sub('\.\.\.', ' ', tweet)
        tweet = tweet.strip('\'"')
        tweet = re.sub('(, |\.( |$))', ' ', tweet)
        tweet = re.sub('[][!"$*,/;<=>?@\\\\^_`{|}~]', ' ', tweet)
        tweet = re.sub('( - )', ' ', tweet)

        return tweet
Example #12
0
 def _split_sentences(self, text):
     from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
     sentence_splitter = PunktSentenceTokenizer(punkt_param)
     sentences = sentence_splitter.tokenize(text)
     return sentences
Example #13
    def sentences(self):
        try:
            return self.sentences_list

        except(AttributeError):
            sentence_tokenizer = SentenceTokenizer()
            self.sentences_list = sentence_tokenizer.tokenize(self.corpus)
            return self.sentences_list
Example #14
0
def _punkt_sent_tokenize(text):
    '''
     Sentence segmentation using nltk PunktSentenceTokenizer.
    '''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(config.tokenize_abbrev)
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
Example #15
0
 def transform(self,documents):
     sentence_splitter = PunktSentenceTokenizer()
     for doc in documents:
         if not 'sentences' in doc.ext:
             doc.ext['sentences'] = [s.strip() for s in sentence_splitter.tokenize(doc.text)]
     # for doc in documents:
     #     if not 'sentences' in doc.ext:
     #         doc.ext['sentences'] = [s.strip() for s in doc.text.split('.') if s]
     return documents
Example #16
0
 def tokenize(self):
     """
     Returns a list of tokenized sentences
     """
     sentence_tokenizer = PunktSentenceTokenizer()
     sentences = sentence_tokenizer.sentences_from_text(self.text)
     sentences = [sentence.split() for sentence in sentences]
     sentences = [[word.strip(",.?!") for word in sentence] for sentence in sentences]
     return sentences
Example #17
0
def parse (text):
    """Use nltk's PunktSentenceTokenizer to convert the text string into
    a list of English-language sentences."""

    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(ABBREVIATIONS)
    sentence_splitter = PunktSentenceTokenizer(punkt_param)

    return sentence_splitter.tokenize(preprocess(text))
Example #18
	def bayesSentiment(self, text):
		from nltk.tokenize.punkt import PunktSentenceTokenizer
		from senti_classifier import senti_classifier

		# break up text into sentences
		stzr = PunktSentenceTokenizer()
		sents = stzr.tokenize(text)
		pos_score, neg_score = senti_classifier.polarity_scores(sents)
		#print pos_score, neg_score
		return [pos_score, neg_score]
Example #19
def split_into_sentences(input_file_name, output_file_name):
    tokenizer = PunktSentenceTokenizer()

    with gzip.open(input_file_name) as input_file:
        with gzip.open(output_file_name, 'w') as sentence_file:
            for line in input_file:
                labelled_review = json.loads(line)
                tokenized_text = tokenizer.tokenize(labelled_review['text'])
                json.dump([tokenized_text, labelled_review['score']], sentence_file)
                sentence_file.write("\n")
Example #20
0
def analyse_hansard_file(filename='House of Representatives_2018_05_10_6091.xml'):
    # Word frequency analysis
    my_abbrev = ['\'m', '.', ',', '\'s', '(', ')', 'n\'t', '\'ve', ';', '$', ':', '\'', '?', '\'ll', '\'re']
    stoplist = set(stopwords.words('english') + my_abbrev)
    soup, sample = parse_hansard(filename)

    # Tokenisation, tagging, chunking
    sent_tokenizer = PunktSentenceTokenizer()
    # Stop breaking sentence at "No."
    sent_tokenizer._params.abbrev_types.add('no')
    #sentences = nltk.sent_tokenize(sample)
    # TODO: improve sentence tokenizer - still far from good
    sentences = sent_tokenizer.tokenize(sample)

    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    # Word frequency over all sentences
    tokens = []
    for sentence in tokenized_sentences:
        tokens += [word for word in sentence if word.lower() not in stoplist]
    display_freq(tokens)

    # Part-of-speech analysis
    tags = []
    for sentence in tagged_sentences:
        tags += sentence
    pos_analysis(tags, my_abbrev)

    # spaCy NER
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sample)
    # Find named entities, phrases and concepts
    ne_spacy = {}
    for entity in doc.ents:
        if entity.label_ in ne_spacy:
            ne_spacy[entity.label_] += [entity.text]
        else:
            ne_spacy[entity.label_] = [entity.text]
    logger.debug("Entity number per type: %s" % {k:len(v) for k,v in ne_spacy.items()})
    for k in ne_spacy.keys():
        display_freq(ne_spacy[k], 'Named entities (%s)' % (k,), top=20)

    # Interjection analysis
    parties = {}
    all_interjections = soup.find_all('interjection')
    for interjection in all_interjections:
        # Can be either a party or a role (Speaker, President, etc, ...)
        party = interjection.party.text or interjection.find('name', role='metadata').text
        if party in parties:
            parties[party] = parties[party] + 1
        else:
            parties[party] = 1
    logger.debug("%s interjections: %s" % (len(all_interjections), parties))
Example #21
    def __init__(self, document):
        self.document = document

        self.sumLength = 10

        self.weights = {}
        self.invWeights = {}
        self.sumIndex = {}
        self.summary = {}

        tokenizer = PunktSentenceTokenizer()
        self.sentences = [sentence.lower() for sentence in tokenizer.tokenize(document)]
Example #22
0
	def textrank(self, document):
	    sentence_tokenizer = PunktSentenceTokenizer()
	    sentences = sentence_tokenizer.tokenize(document)

	    bow_matrix = CountVectorizer().fit_transform(sentences)
	    normalized = TfidfTransformer().fit_transform(bow_matrix)

	    similarity_graph = normalized * normalized.T

	    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
	    scores = nx.pagerank(nx_graph)
	    return sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
Example #23
0
def preprocess_doc(doc):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.sentences_from_text(doc)    
    tokens = []
    for sentence in sentences:
        #sentence1 = sentence.split()
        sentence1 = neg_scope(sentence)
        tokens.extend(w for w in sentence1 if w.lower() not in stopwords.words("english"))
    for ii in xrange(len(tokens)):
        if tokens[ii][-1] == '.':
            tokens[ii] = tokens[ii][:-1]
    return tokens
Example #24
0
def myNLTKParser(document, tagger):
    lexical_diversity = len(document) / len(set(document)) * 1.0

    punkt_param = PunktParameters()
    # if any customized abbrev
    # punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])

    # tokenize to sentence
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(document.replace("'s", "_s"))

    # tokenize sentence to words
    word_tokens = [[w.strip() for w in nltk.word_tokenize(s) if not w.strip().lower() in stopwords] for s in sentences]

    # extend token to bigram and trigram
    extended_tokens = []
    for token_list in word_tokens:
        extended_tokens.append(token_list + list(nltk.bigrams(token_list)) + list(nltk.trigrams(token_list)))

        # word stemmer to normalize
    p_stemmer = PorterStemmer()
    stem_tokens = []
    for token_list in word_tokens:
        stem_tokens.append([p_stemmer.stem(w) for w in token_list])

        # POS tags
    tags = [tagger.tag(a) for a in extended_tokens]

    tags_of_verbs = ["NN", "VB", "VBP", "VBG"]
    tags_of_interest = ["JJ", "JJR", "JJS", "NN", "NNP", "NNPS", "NNS", "RB", "RBR", "RBS"]
    tags_of_noun = ["NN"]
    merged_tags_uni = [
        word for sublist in tags for (word, tag) in sublist if tag in tags_of_verbs and isinstance(word, tuple) == False
    ]
    merged_tags_bi = [
        word
        for sublist in tags
        for (word, tag) in sublist
        if tag in tags_of_interest and isinstance(word, tuple) and len(word) == 2
    ]
    merged_tags_tri = [
        word
        for sublist in tags
        for (word, tag) in sublist
        if tag in tags_of_interest and isinstance(word, tuple) and len(word) == 3
    ]

    uni_tags_fd = nltk.FreqDist(merged_tags_uni)
    bi_tags_fd = nltk.FreqDist(merged_tags_bi)
    tri_tags_fd = nltk.FreqDist(merged_tags_tri)

    return {"uni_fd": uni_tags_fd.max(), "bi_fd": bi_tags_fd.max(), "tri_fd": tri_tags_fd.max()}
Example #25
0
def keyword_sentiment():

    ## take in the input
    word = sys.argv[1]
    date_diff = int(sys.argv[2])
    
    ## create a sentence_tokenizer
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20'])
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)
    
    ## calculate the barrier date
    DD = datetime.timedelta(days=date_diff)
    barrier_date = datetime.datetime.now()- DD

    ## make connection to db and fetch tweets (and respective sentiment) above the barrier_date
    db = MySQLdb.connect(host="localhost",user="******",passwd="{2qGq(22+5iU",db="Insights")
    cur = db.cursor()
    sql = "SELECT Phrase,Sentiment FROM Phrases WHERE `Date`>'"+str(barrier_date)+"';"
    cur.execute(sql)

    total_sentiment = 0
    total_count = 0
    ## locate tweets which contain keyword, tokenize them into sentences
    for row in cur.fetchall():
        if(row[0].lower().find(word.lower())!=-1):
            sentences = sent_tokenizer.tokenize(row[0])
            
    ## if a single sentence then just take the sentiment from db
            if len(sentences) == 1:
                total_sentiment = total_sentiment + float(row[1])
                total_count = total_count+1
                
    ## else add together sentiment of sentence and keep the count
            else:
                for sentence in sentences:
                        blob = TextBlob(sentence)
                        total_sentiment= total_sentiment + int(blob.sentiment.polarity*1000)/1000.0
                        if(sentence.lower().find(word.lower())!=-1):
                            total_count = total_count+1
                            
    ## json the total_sentiment/count and count
    if(total_count!=0):
        json_array = [{"sentiment": int(total_sentiment/total_count*1000)/1000.0, "count": total_count}]
    else:
        json_array = [{"sentiment": 0, "count": 0}]
    ## close the connection to the db
    db.close()
    ## print the json
    print(json.dumps(json_array))
Example #26
0
def splitIntoSentences2(file_name):
  punkt_param = PunktParameters()
  punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
  sentence_splitter = PunktSentenceTokenizer(punkt_param)
  fp = open(file_name)
  data = fp.read()
  data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')

  sentences = []
  for para in data.split('\n'):
    if para:
      sentences.extend(sentence_splitter.tokenize(para))
  # print '\n-----\n'.join(sentences)
  return sentences
Example #27
def build_doc2vec_model(save_file=False):
    client = MongoClient()
    db = client['metacritic']
    coll = db['steam_games']

    all_games = list(coll.find({'user_review': {"$exists": "true"},
                            'total_user_reviews': {'$ne': 0},
                            'game_name': {'$not': re.compile("Demo")} }))

    plv = PunktSentenceTokenizer()
    # stemmer = PorterStemmer()

    labeled_sentences = []
    for game in all_games:
        game_name = game['game_name']
        user_data = game['user_review']
        # critic_data = game['critic_review']

        user_reviews = user_data['reviews']

        for user_review in user_reviews:
            review = user_review['review']
            review = review.encode('ascii', 'replace')
            review = str(review).translate(string.maketrans("",""), string.punctuation)
            review_sentence = [sentence.split() for sentence in plv.tokenize(review.lower())]

            if len(review_sentence) == 0: 
                continue
            else:
                review_sentence = review_sentence[0]
                # stemmed_sentence = []
                # for word in review_sentence[0]:
                #     stemmed_sentence.append(stemmer.stem(word))

            sentence = doc2vec.LabeledSentence(words=review_sentence, labels=[game_name])
            labeled_sentences.append(sentence)

    model = Doc2Vec(alpha=0.025, min_alpha=0.025, workers=4)#, train_words=False, train_lbls=True)
    model.build_vocab(labeled_sentences)

    for epoch in range(10):
        model.train(labeled_sentences)
        model.alpha -= 0.002  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay

    if save_file:
        with open('data/model.pkl', 'wb') as f_model:
            pickle.dump(model, f_model)
    else:
        return model
Example #28
0
    def test_tokenize(self):

        train = "\n".join(itertools.imap(strip_tags, itertools.chain(*(speech['text'] for speech in self.speeches[0:10]))))

        print train
        tokenizer = PunktSentenceTokenizer(train)



        sents = tokenizer.tokenize(strip_tags(self.speeches[0]['text'][0]))

        sents = tokenize_sents(strip_tags(self.speeches[0]['text'][0]))

        self.assertEqual(len(sents), 3)
Example #29
0
    def loadCorpus(self, path):
        
        for encoding in self.__encodings:

            try:
                self.__path = path
                fileName = codecs.open( self.__path,'r', encoding=encoding )
                self.__rawText = fileName.read()
                break
            
            except UnicodeDecodeError:
                encoding = ''
                continue
                 
        if encoding!='':
            self.initFields()
            
            #SENTENCES
            # more abbreviations with dots
            punkt_param = PunktParameters()
            punkt_param.abbrev_types = set(['dr', 'vs', 'n', 'v', 'etc', 'art', 'p', 'Cost', 'ss', 'pag'])
            
            sentence_splitter = PunktSentenceTokenizer(punkt_param)
            text = re.sub(ur'[\'\<\>`’]', ' ', self.__rawText)
            #text = re.sub('(\d+)', r' \1 ', text)
            sentences = sentence_splitter.tokenize(text)
            
            #TOKENS
            self.__tokens = [[token, ''] for token in list(itertools.chain(*[ customWordtokenize(sent) for sent in sentences]))]
            wordTokenizer = RegexpTokenizer('[a-zA-Z0-9\xe0\xe1\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa]+')
            #wordTokenizer = RegexpTokenizer('[\w]+')
            
            sentences = [wordTokenizer.tokenize(sent.lower()) for sent in sentences if len(wordTokenizer.tokenize(sent)) > 0]
            words =  list(itertools.chain(*sentences))
            self.__words = words
            self.__sentences = sentences
            
            self.__avgSentLength = round(np.mean( [len(sent) for sent in sentences]), 3)
            self.__avgWordLength = round(np.mean( [len(word) for word in words]), 3)
            self.__freqDist = FreqDist(words)
            self.__wordCount = len(words)
            self.__lexicalDiversity = round(len(self.__freqDist.items())/float(len(words)), 5)
            
            ### resetting members
            self.__concordanceIndex = None
            self.__bigrams = None
                 
        return encoding
Example #30
0
def getSentences(paragraph):

	unicode_data= paragraph.decode("utf-8")
	data= "".join([i if ord(i) < 128 else "" for i in unicode_data])

	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	punkt_params = PunktParameters()
	punkt_params.abbrev_types = set(['al',"inc","mr","dr","mrs","prof"])
	splitter = PunktSentenceTokenizer(punkt_params)

	sentences=splitter.tokenize(data)
	
	sentences1=filter_list(sentences)
	##print sentences1,"\n----------------------------------------------------------------------------"
	return sentences1
Example #31
0
    def __init__(self):
        self.modelfile = 'punket_tokenizer.pk'

        if os.path.exists(self.modelfile):
            self.tokenizer = self.punkt_tokenize_load()

        else:
            self.trainer = PunktTrainer()
            text = ""
            for file_id in gutenberg.fileids():
                text += gutenberg.raw(file_id)
            self.trainer.INCLUDE_ALL_COLLOCS = True
            self.trainer.train(text)
            self.tokenizer = PunktSentenceTokenizer(self.trainer.get_params())

            self.tokenizer._params.abbrev_types.add('dr')
            self.tokenizer._params.abbrev_types.add('mr')
            self.tokenizer._params.abbrev_types.add('mrs')
            self.tokenizer._params.abbrev_types.add('miss')
            self.tokenizer._params.abbrev_types.add('ms')
            self.tokenizer._params.abbrev_types.add('no')

            self.tokenizer._params.abbrev_types.add('jan')
            self.tokenizer._params.abbrev_types.add('feb')
            self.tokenizer._params.abbrev_types.add('mar')
            self.tokenizer._params.abbrev_types.add('apr')
            self.tokenizer._params.abbrev_types.add('may')
            self.tokenizer._params.abbrev_types.add('jun')
            self.tokenizer._params.abbrev_types.add('aug')
            self.tokenizer._params.abbrev_types.add('sep')
            self.tokenizer._params.abbrev_types.add('oct')
            self.tokenizer._params.abbrev_types.add('nov')
            self.tokenizer._params.abbrev_types.add('dec')

            with open(self.modelfile, mode='wb') as fout:
                pickle.dump(self.tokenizer,
                            fout,
                            protocol=pickle.HIGHEST_PROTOCOL)
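The punkt_tokenize_load() method referenced above is not shown; a hypothetical sketch of the loading step (an assumption about what it likely does) is simply unpickling the saved model:

import pickle

def punkt_tokenize_load(modelfile='punket_tokenizer.pk'):
    # hypothetical helper: load the tokenizer pickled by the constructor above
    with open(modelfile, mode='rb') as fin:
        return pickle.load(fin)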
Example #32
0
def tokenize_text(seq):
    '''Tokenizes a string containing one or more sentences, and returns a
    list of lists, with the outer list representing sentences and the inner
    lists representing tokenized words within each sentence.  This does not
    remove stop words or do more advanced NL processing.'''

    def only_words(sent):
        # Takes a list and returns a version with only plausible words.
        return [w for w in sent if is_word(w)]

    def clean_words(sent):
        # Takes a list of words and cleans them to remove stray punctuation.
        return [re.sub(_stray_punct, '', word) for word in sent]

    # Replace common contractions that are safe to replace.
    replacer = RegexpReplacer(_common_contractions)
    text = replacer.replace(seq)
    # Compress multiple blank lines into one.
    text = re.sub(r'\n+', '\n', text)
    # Remove URLs.
    text = re.sub(url_compiled_regex, '', text)
    # Split words at certain characters that are not used in normal writing.
    text = str.translate(text, _odd_char_splitter)
    # Split the text into sentences.
    punkt_vars = ModifiedPunktLanguageVars()
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = _common_abbrevs
    sentence_splitter = PunktSentenceTokenizer(punkt_param, lang_vars=punkt_vars)
    sentences = sentence_splitter.tokenize(text, realign_boundaries=True)
    # Tokenize each sentence individually.
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    # Filter out items that don't have any letters in them, or are too long.
    sentences = [only_words(sent) for sent in sentences]
    # Remove embedded quote characters & other oddball characters in strings.
    sentences = [clean_words(sent) for sent in sentences]
    # Remove blanks and return the result
    sentences = [x for x in sentences if x]
    return sentences
Example #33
0
    def setup_model(self,
                    model_path,
                    config=None,
                    label_file=None,
                    no_cuda=False):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        # load labels
        self.labels_list = torch.load(label_file)
        # with open(label_file, "r", encoding="utf-8") as f:
        #     for line in f:
        #         line = line.strip().split("\t")
        #         self.labels_list.append(line)

        config = BertConfig.from_pretrained(config,
                                            num_labels=2,
                                            cache_dir=None,
                                            output_hidden_states=True)
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                                  do_lower_case=True,
                                                  cache_dir=None)

        model = MTDNNModelV2.from_pretrained(
            model_path,
            from_tf=bool(".ckpt" in model_path),
            config=config,
            labels_list=self.labels_list,
            task_list=TASK_LIST,
            do_task_embedding=False,
            do_alpha=False,
            do_adapter=False,
            num_adapter_layers=2)

        self.model_config = config
        self.tokenizer = tokenizer
        self.sent_tokenizer = PunktSentenceTokenizer()
        self.model = model
        self.model.to(self.device)
Example #34
0
    def sentoken(self, data):  # split into sentences
        #token = nltk.data.load('tokenizers/punkt/english.pickle')
        #sents = token.tokenize(str(data))
        text = re.sub('\n', ' ', str(data))
        if isinstance(text, str):
            text = text
        else:
            raise ValueError('Document is not string!')
        point_re = re.compile(r'(\D)\.')
        text = re.sub(point_re, '\g<1>. ', str(text))
        #text = re.sub(r'\.', '. ', str(text))
        text = re.sub(r'\?', '? ', str(text))
        text = re.sub(r'!', '! ', str(text))
        text = re.sub(r'i\. e\. ', 'i.e.', str(text))
        text = text.strip()

        punkt_param = PunktParameters()
        abbreviation = ['i.e']
        punkt_param.abbrev_types = set(abbreviation)
        tokenizer = PunktSentenceTokenizer(punkt_param)
        sents = tokenizer.tokenize(text)
        sents = [sent.strip() for sent in sents]
        return sents
Example #35
0
class GCBlockExtractor(ExtractionMapper):
    def __init__(self):
        super(GCBlockExtractor,
              self).__init__(extraction_function=self._blocks_from_text)
        self.tokenizer = PunktSentenceTokenizer()

    def _blocks_from_text(self, page):
        blocks = []
        for sentence in self.tokenizer.sentences_from_text(
                page.text.replace('\n', '')):
            if sentence.strip():
                blocks.append(len(sentence))
            # maybe count tokens? or non-spaces?
        return blocks
Example #36
0
    def sentoken(self, data):  # split into sentences
        #token = nltk.data.load('tokenizers/punkt/english.pickle')
        #sents = token.tokenize(str(data))
        text = re.sub('\n', ' ', str(data))  # replace newlines in data with spaces
        if isinstance(text, str):
            text = text
        else:
            raise ValueError('Document is not string')
        point_re = re.compile(r'(\D)\.')  # matches a period preceded by a non-digit character
        text = re.sub(point_re, '\g<1>. ', str(text))  # \g<1> refers back to the captured group
        text = re.sub(r'\.', '. ', str(text))
        text = re.sub(r'\?', '? ', str(text))
        text = re.sub(r'\!', '! ', str(text))
        text = re.sub(r'i\. e\. ', 'i.e.', str(text))  # restore the abbreviation that had spaces inserted
        text = text.strip()  # strip leading/trailing whitespace

        punkt_param = PunktParameters()
        abbreviation = ['i.e']
        punkt_param.abbrev_types = set(abbreviation)  # custom abbreviation list (a set holds unique, unordered items)
        tokenizer = PunktSentenceTokenizer(punkt_param)
        sents = tokenizer.tokenize(text)
        sents = [sent.strip() for sent in sents]
        return sents
Example #37
0
File: lda.py Project: HsiaoCong/eg
def lda(document):

    # split into sentences
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)

    # compute term counts (bag of words)
    c = CountVectorizer()
    bow_matrix = c.fit_transform(sentences)

    #print (bow_matrix.shape)

    # get all the words in the bag-of-words vocabulary
    all_words = (c.get_feature_names())

    #index2word
    index2words = {v: k for k, v in c.vocabulary_.items()}

    lda = LatentDirichletAllocation(n_topics=2, max_iter=5)
    lda.fit(bow_matrix)

    print(lda.components_.shape)
    print(lda.transform(bow_matrix).shape)
Example #38
0
    def __init__(self,
                 strip_accents="unicode",
                 lowercase=True,
                 remove_html=True,
                 join_urls=True,
                 use_bigrams=True,
                 use_ner=True,
                 stanford_ner_path="",
                 use_lemmatizer=False,
                 use_stemmer=False):

        self.stanford_ner_path = stanford_ner_path  # path to stanford NER
        self.strip_accents = strip_accents  # options: {‘ascii’, ‘unicode’, None}
        self.lowercase = lowercase
        self.remove_html = remove_html
        self.join_urls = join_urls
        self.use_bigrams = use_bigrams
        self.use_ner = use_ner
        self.use_lemmatizer = use_lemmatizer  # use lemmatizer instead of stemmer?
        self.use_stemmer = use_stemmer

        # self.stanford_corenlp = StanfordCoreNLP(self.stanford_corenlp_path, memory="8g")
        self.sentence_splitter = PunktSentenceTokenizer(
        ).tokenize  # Punkt sentence splitter
        self.stemmer = SnowballStemmer("english").stem  # Snowball stemmer
        self.lemmatizer = WordNetLemmatizer().lemmatize  # WordNet lemmatizer
        self.base_tokenizer = CountVectorizer().build_tokenizer(
        )  # sklearn tokenizer works the best, I think...
        self.stop_words = stopwords.words(
            "english")  # nltk list of 128 stopwords
        self.token_pattern = re.compile(
            r"(?u)\b(\w*[a-zA-Z_]\w+|\w+[a-zA-Z_]\w*)\b"
        )  # default value was r"(?u)\b\w\w+\b"
        self.numeric_pattern = re.compile(r"^[0-9]+$")  # number regex
        self.url_pattern = re.compile(r"((http://)?(www\..*?\.\w+).*?)\s")
        self.compound_pattern = re.compile(r"\w+(\-\w+)+")

        if self.use_lemmatizer:
            self.tokenizer = CustomTokenizer(self.base_tokenizer,
                                             self.lemmatizer,
                                             self.token_pattern,
                                             self.numeric_pattern)
        elif self.use_stemmer:
            self.tokenizer = CustomTokenizer(self.base_tokenizer, self.stemmer,
                                             self.token_pattern,
                                             self.numeric_pattern)
        else:
            self.tokenizer = CustomTokenizer(self.base_tokenizer, lambda x: x,
                                             self.token_pattern,
                                             self.numeric_pattern)
Example #39
0
def create_sentences(text_file, min_sentence_len):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True

    with open(text_file, "r") as input_file:
        paragraphs = input_file.read()

    trainer.train(paragraphs)

    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # print(tokenizer._params.abbrev_types)

    sentences = []

    for line in open(text_file, "r+").readlines():
        sentences_tmp = tokenizer.tokenize(line)
        for sentence in sentences_tmp:
            sentences.append(sentence)

    with open("dataset/sentences.txt", "a") as out_file:
        for sentence in sentences:
            if len(sentence) > min_sentence_len:
                out_file.write(sentence + "\n\n")
Example #40
0
def preprocess(doc):
    sentences = PunktSentenceTokenizer().tokenize(doc)
    corpus = []
    for sentence in sentences:
        temp = []
        words = nltk.word_tokenize(sentence)
        for word in words:
            word = re.sub(r'\W+', '', word)
            word = re.sub(r'_+', '', word)
            if (word != ''):
                temp.append(word.lower())
        if (len(temp) != 0):
            corpus.append(temp)
    return corpus
Example #41
0
def tokenize_sentences(input):
    clean_sentences = []
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    input = input.rstrip()
    sentences = sentence_splitter.tokenize(input)
    # sentence detection: iterate over the tokenized sentences
    for sentence in sentences:
        sentence = sentence[:-1]  # remove the final period of each sentence
        if not len(sentence) < 20:  # ignore sentences shorter than 20 characters
            sentence = sentence.replace("\n", "")  # drop newlines
            sentence = sentence.replace("/", " ")  # replace slashes with spaces
            # add start ("!$!") and end ("!€!") markers and collect the sentence
            clean_sentences.append("!$! " + sentence + " !€!")
    #print(clean_sentences)
    return clean_sentences
Example #42
0
def textRank(document):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)

    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    similarity_graph = normalized * normalized.T

    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    text_rank_graph = sorted(((scores[i], s) for i, s in enumerate(sentences)),
                             reverse=True)
    number_of_nodes = int(0.25 * len(text_rank_graph))

    if number_of_nodes < 3:
        number_of_nodes = 3

    del text_rank_graph[number_of_nodes:]

    summary = ' '.join(word for _, word in text_rank_graph)

    return summary
Example #43
0
def create_sentence_tokens(class_num):
    if (class_num == 1):
        os.chdir(
            "C:\Users\MyPC\Desktop\Ass3\Ass3/20_newsgroups/comp.graphics/")
    else:
        os.chdir(
            "C:\Users\MyPC\Desktop\Ass3\Ass3/20_newsgroups/rec.motorcycles/")
    all_sentence_tokens = []
    for file in glob.glob("*"):
        f = open(file, 'rb')
        sentences = PunktSentenceTokenizer().tokenize(f.read())
        all_sentence_tokens += sentences
        f.close()
    return all_sentence_tokens
Example #44
0
def IsItPlagiarized():
    text_to_filter = request.form['text_to_check']
    if (text_to_filter.lstrip().rstrip() == ''):
        return render_template('plagiarizer-submit.html')
    punkt_param = PunktParameters()
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text_to_filter)
    probability_of_plagiarism = 0
    for a_sentence in sentences:
        # add a timer so we don't upset bing!
        time.sleep(0.3)
        content = ''.join(filter(lambda x: x in string.printable, a_sentence))
        the_term = urllib.parse.quote('+' + '"' + content + '"')
        page = requests.get('https://www.bing.com/search?q=' + the_term)
        if ((not "No results found for" in page.text)
                and (not "No hay resultados para" in page.text)
                and (not "are no results for" in page.text)):
            probability_of_plagiarism += 1
    is_it_plagiarized = str(
        (probability_of_plagiarism / len(sentences)) * 100) + '%'
    return render_template('plagiarizer-results.html',
                           text_to_filter=text_to_filter,
                           is_it_plagiarized=is_it_plagiarized)
Example #45
0
 def __init__(self, ignore_headers=True, raise_invalid_tags=False):
     """
     :param ignore_headers: If true, ignores text inside of the tags included in HEADER_ELEMENTS. This defaults to
     true because the text inside of these "header elements" is typically not a sentence.
     :param raise_invalid_tags: If true, raises an InvalidTagError when parsing a tag not in INLINE_ELEMENTS,
     BLOCK_LEVEL_ELEMENTS (which includes the elements of HEADER_ELEMENTS), SKIPPED_ELEMENTS, EMPTY_ELEMENTS, or
     SENTENCE_VOID_ELEMENTS. If false, ignores this tag and all of its children. (Sentences descending from it will
     not be included in the value returned from feed)
     """
     # self.parser is an etree parser by default.
     self.parser = html5lib.HTMLParser()
     self.walker = html5lib.getTreeWalker("etree")
     self.sentences = []
     self.ignored_parent_count = 0
     self.current_string = ''
     self.ignore_header_text = ignore_headers
     self.raise_invalid_tags = raise_invalid_tags
     punkt_param = PunktParameters()
     abbreviations = [
         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
         'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
         'Adj', 'Adm', 'Adv', 'Asst', 'Bart', 'Bldg', 'Brig', 'Bros',
         'Capt', 'Cmdr', 'Col', 'Comdr', 'Con', 'Corp', 'Cpl', 'DR', 'Dr',
         'Drs', 'Ens', 'Gen', 'Gov', 'Hon', 'Hr', 'Hosp', 'Insp', 'Lt',
         'MM', 'MR', 'MRS', 'MS', 'Maj', 'Messrs', 'Mlle', 'Mme', 'Mr',
         'Mrs', 'Ms', 'Msgr', 'Op', 'Ord', 'Pfc', 'Ph', 'Prof', 'Pvt',
         'Rep', 'Reps', 'Res', 'Rev', 'Rt', 'Sen', 'Sens', 'Sfc', 'Sgt',
         'Sr', 'St', 'Supt', 'Surg', 'v', 'vs', 'i.e', 'inc', 'rev', 'e.g',
         'etc', 'Nos', 'Nr', 'pp', 'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul',
         'Aug', 'Sep', 'Oct', 'Nov', 'Dec'
     ]
     punkt_param.abbrev_types = set(abbreviations)
     self.tokenizer = PunktSentenceTokenizer(punkt_param)
     logging.basicConfig(filename='html-tokenizer.log',
                         level=logging.WARNING,
                         format='[%(asctime)s] [%(levelname)s] %(message)s',
                         datefmt='%Y-%m-%d %H:%M:%S')
Example #46
 def summarize(self):
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
     sentence_splitter = PunktSentenceTokenizer(punkt_param)
     sentences = sentence_splitter.tokenize(self.text)
     structure = {}
     sentence_objects = []
     for idx in range(len(sentences)):
         obj = {'text' : sentences[idx], 'index' : idx , 'data': {}}
         sentence_objects.append(obj)
     structure['sentences'] = sentence_objects
     structure['ordered'] = []
     structure['weights'] = {'words' : FreqDist(nltk.word_tokenize(preprocess(self.text))), 'total': 0}
     structure['weights']['total'] = sum(structure['weights']['words'].values())
     for each_sent in structure['sentences']:
         each_sent['data']['tokens'] = nltk.word_tokenize(preprocess(each_sent['text']))
         each_sent['data']['weights'] = {}
         each_sent['data']['weights']['words'] = self.calculate_relative_frequence(each_sent['data']['tokens'], structure['weights']['words'])
         each_sent['data']['weights']['total'] = sum(each_sent['data']['weights']['words'].values())
     structure['ordered'] = sorted(structure['sentences'], key=lambda x:x['data']['weights']['total'], reverse=True)
     structure_keep = structure['ordered'][:self.quota]
     structure_keep.sort(key=lambda x:x['index'])
     for eac_sen in structure_keep:
         self.summary.append(eac_sen['text'])
Example #47
0
def semafor_local(text):
    semafor = join(dirname(__file__),'../{0}/bin/runSemafor.sh'.format(config.get('semafor', 'base_dir')))
    input_file = join(dirname(__file__),'../{0}/bin/in.txt'.format(config.get('semafor', 'base_dir')))
    with open(input_file, 'w') as f:
        tokenizer = PunktSentenceTokenizer()
        sentences = tokenizer.tokenize(text)
        f.write('\n'.join(sentences))
    output_file = join(dirname(__file__),'../{0}/bin/out.txt'.format(config.get('semafor', 'base_dir')))
    if isfile(output_file):
        remove(output_file)
    process = subprocess.Popen([semafor, input_file, output_file, '1'],
                           shell=False)
    out, err = process.communicate(text)
    if err:
        log.debug(err)

    sentences_semantics = []
    with open(output_file) as f:
        # semafor outputs an invalid JSON, with one dictionary per line
        for line in f:
            sentence_dict = json.loads(line.rstrip())
            sentences_semantics.append(sentence_dict)

    return sentences, sentences_semantics
Example #48
0
def sentence_splitter(lang):
    """

    :type lang: str
    :rtype: nltk.tokenize.punkt.PunktSentenceTokenizer
    """
    punkt_param = PunktParameters()
    path = os.path.dirname(__file__)
    ab_file = ''.join([path, SUBFOLDER, lang])
    if os.path.isfile(ab_file):
        punkt_param.abbrev_types = set(abbreviation_loader(ab_file))
    else:
        logging.info('Abbreviation file not found for language: %s', lang)
    splitter = PunktSentenceTokenizer(punkt_param)
    return splitter
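The abbreviation_loader() helper used above is not shown; a hypothetical sketch (an assumption, not the project's code) would read one abbreviation per line from the language file:

def abbreviation_loader(path):
    # hypothetical: one lowercase abbreviation per line, without trailing periods
    with open(path, encoding='utf-8') as f:
        return [line.strip().lower().rstrip('.') for line in f if line.strip()]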
Example #49
0
    def newPred(self, text):
        sentence_tokenizer = PunktSentenceTokenizer()
        sentences = sentence_tokenizer.tokenize(text)
        print(len(sentences))
        predList = PreprocessingClass().filter_sentences(text)

        with open('classifier.pickle', 'rb') as f:
            clf = pickle.load(f)

        with open('vectorize.pickle', 'rb') as f:
            vect = pickle.load(f)

        with open('tfidfmodel.pickle', 'rb') as f:
            tfidf = pickle.load(f)

        predText = vect.transform(predList).toarray()
        predText = tfidf.transform(predText).toarray()
        new_pred = clf.predict(predText)
        finalSum = []
        for i, j in enumerate(new_pred):
            if j == 1:
                finalSum.append(sentences[i])

        return finalSum
Example #50
0
    def __init__(self, path):
        document = path
        if os.path.exists(path):
            with open(path, "r") as file:
                document = file.read().replace('\n', ' ')

        document = QUOTES.sub('', document)

        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        sentence_tokenizer = PunktSentenceTokenizer()

        self.sentences = sentence_tokenizer.tokenize(document)
        bow_matrix = tfidf_vectorizer.fit_transform(self.sentences)
        self.tfidf_features = tfidf_vectorizer.get_feature_names()

        sentence_similarity_matrix = bow_matrix * bow_matrix.T
        word_similarity_matrix = bow_matrix.T * bow_matrix

        self.sentence_nx_graph = nx.from_scipy_sparse_matrix(
            sentence_similarity_matrix)
        self.word_nx_graph = nx.from_scipy_sparse_matrix(
            word_similarity_matrix)
        self.__sentence_pagerank = None
        self.__word_pagerank = None
Example #51
0
def get_sentence_tokenizer(language):
    """
    Return the sentence tokenizer callable.
    """

    pickle_path = 'sentence_tokenizer.pickle'

    try:
        input_file = open(pickle_path, 'rb')
        sentence_tokenizer = load(input_file)
        input_file.close()
    except FileNotFoundError:

        data_file_paths = []

        sentences = []

        try:
            # Get the paths to each file the bot will be trained with
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=language.ENGLISH_NAME.lower()
            ))
        except LookupError:
            # Fall back to English sentence splitting rules if a language is not supported
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=languages.ENG.ENGLISH_NAME.lower()
            ))

        data_file_paths.extend(corpus_files)

        for corpus, _categories, _file_path in load_corpus(*data_file_paths):
            for conversation in corpus:
                for text in conversation:
                    sentences.append(text.upper())
                    sentences.append(text.lower())

        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train('\n'.join(sentences))

        sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())

        # Pickle the sentence tokenizer for future use
        output_file = open(pickle_path, 'wb')
        dump(sentence_tokenizer, output_file, -1)
        output_file.close()

    return sentence_tokenizer
Example #52
0
 def text_sentences(text):
     if isinstance(text, bytes):
         text = text.decode('utf-8')
     lines = []
     for line in text.splitlines(keepends=False) if isinstance(text, str) else text:
         line = fix_text(line.decode('utf-8') if isinstance(line, bytes) else line).strip()
         if len(line) <= 1: continue
         line = blanksre.sub(' ', line)
         lines.append(line)
     punkt_param = PunktParameters()
     punkt = PunktSentenceTokenizer(punkt_param)
     punkt.train('\n'.join(lines))
     r = []
     for line in lines:
         r.extend(punkt.tokenize(line))
     return r
Example #53
    def test(data_generator):
        for _id, query, docs in data_generator:

            # tokenization
            tokenized_query = tokenizer.texts_to_sequences([query])[0]
            
            if queries_sw is not None:
                tokenized_query = [token for token in tokenized_query if token not in queries_sw] 
            
            for doc in docs:
                if isinstance(doc["text"], list):
                    continue # cached tokenization

                # sentence splitting
                new_docs = []
                
                _temp_new_docs = []
                doc["offset"] = []
                for start, end in PunktSentenceTokenizer().span_tokenize(doc["text"]):
                    _temp_new_docs.append(doc["text"][start:end])

                    if start<(len(doc["title"])-1):
                        doc["offset"].append(["title",(start, end), doc["text"][start:end], []])
                    else:
                        doc["offset"].append(["abstract", (start-len(doc["title"]), end-len(doc["title"])), doc["text"][start:end], []])

                _temp_new_docs = tokenizer.texts_to_sequences(_temp_new_docs)

                if docs_sw is not None:
                    for tokenized_docs in _temp_new_docs:
                        tokenized_docs = [token for token in tokenized_docs if token not in docs_sw]

                #doc["extra_features"] = compute_extra_features(tokenized_query, _temp_new_docs, idf_from_id_token)+[doc["score"]]

                for k,t_q in enumerate(tokenized_query):
                    new_docs.append([])
                    for l,_new_doc in enumerate(_temp_new_docs):
                        for i,t_d in enumerate(_new_doc):
                            if t_d==t_q:
                                new_docs[-1].append(_new_doc)
                                doc["offset"][l][-1].append(k)
                                break

                                                                    
                doc["text"] = new_docs
                                                                    
            yield _id, tokenized_query, docs
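The offset bookkeeping in this example relies on PunktSentenceTokenizer.span_tokenize, which yields (start, end) character offsets instead of the sentence strings, so each sentence can be attributed to the title or the abstract. A minimal sketch of that call on a made-up title + abstract string:

from nltk.tokenize.punkt import PunktSentenceTokenizer

title = "A made-up title. "
abstract = "The first abstract sentence mentions tokenization. The second one does not."
text = title + abstract

for start, end in PunktSentenceTokenizer().span_tokenize(text):
    # Offsets that start inside the title are attributed to it, as in the example above.
    section = "title" if start < (len(title) - 1) else "abstract"
    print(section, (start, end), text[start:end])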
Example #54
0
class SentenceSplitter(object):
    def __init__(self):
        super(SentenceSplitter, self).__init__()
        self.sent_tokeniser_ = PunktSentenceTokenizer()
    
    def process(self, text, tokens):
        token_strs = [text[e[0] : e[1]] for e in tokens]
        
        sents = self.sent_tokeniser_.sentences_from_tokens(token_strs)
        curr = 0
        res_sents = list()
        for sent in sents:
            res_sents.append([Span(begin = e[0], end = e[1]) 
                              for e in tokens[curr : curr + len(sent)]])
            curr += len(sent)
        
        return res_sents
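sentences_from_tokens works on a word-token stream that was produced elsewhere (here, the strings recovered from an upstream tokenizer's (begin, end) spans), grouping the tokens into sentences instead of re-splitting raw text. A minimal sketch with a hand-made token list:

from nltk.tokenize.punkt import PunktSentenceTokenizer

tokens = ["The", "splitter", "groups", "tokens", ".", "It", "does", "not", "re-split", "text", "."]
splitter = PunktSentenceTokenizer()

for sentence_tokens in splitter.sentences_from_tokens(tokens):
    print(sentence_tokens)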
Example #55
0
def build_made_tokenizer(keep_token_strings=False):
    print('Building MADE tokenizer...')
    cs_preprocess_split_re_strings = []
    # double newlines
    cs_preprocess_split_re_strings.append(r'[\r\n]{2,}')
    # newlines with only spaces
    cs_preprocess_split_re_strings.append(r'[\r\n]+\s+[\r\n]+')
    # numbered lists (e.g. "1.", "2)")
    cs_preprocess_split_re_strings.append(r'(^|\r|\n)+\s*\d+[.)-]')
    # bulleted lists (e.g."*", "-")
    cs_preprocess_split_re_strings.append(r'(^|\r|\n)+\s*[*-]')
    # starting labels (e.g. "WEIGHT:")
    cs_preprocess_split_re_strings.append(r'(^|\r|\n)+\s*\w+[:]')
    # break up other lines separated by dates
    cs_preprocess_split_re_strings.append(
        r'(^|\r|\n)+\s*\d{1,2}[/-]\d{1,2}[/-]\d{2,4}')
    # MIMIC has many lines that start with this [**YYYY-M-DD**]
    cs_preprocess_split_re_strings.append(r'^\[\*+\d{4}-\d{1,2}-\d{1,2}\*+\]')
    # TIU notes have long bars like this : '***********' or '===========' or '------'
    cs_preprocess_split_re_strings.append(r'[*=-]{3,}')

    # NOTE: This breaking rule was disabled 2-13-18. The UMass MADE challenge data often ended
    # each line with two spaces and a newline, which made this aggressive rule (break anything
    # with lots of spaces, i.e. tabular data) fire over and over again.
    #cs_preprocess_split_re_strings.append(r'\s{3,}')

    custom_lang_vars = CustomSentenceBreakingLangVars()
    custom_lang_vars.sent_end_chars = ('.', '!')
    print(custom_lang_vars.sent_end_chars)

    punkt_tokenizer2 = PunktSentenceTokenizer(lang_vars=custom_lang_vars)
    treebank_tokenizer = TreebankWordTokenizer()

    # looks like "pt." and "D.R." and "P.R." are already being handled
    #punkt_tokenizer2._params.abbrev_types.update(extra_abbrev)

    cs_tokenizer = basic.nlp.tokenizers.clinical_tokenizers.ClinicalSentenceTokenizer(
        default_sentence_tokenizer=punkt_tokenizer2,
        preprocess_split_re_strs=cs_preprocess_split_re_strings)

    made_index_tokenizer = basic.nlp.tokenizers.clinical_tokenizers.IndexTokenizer(
        cs_tokenizer,
        treebank_tokenizer,
        keep_token_strings=keep_token_strings)

    return made_index_tokenizer
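The lang_vars hook used above changes which characters may end a sentence; with sent_end_chars = ('.', '!') a question mark no longer triggers a split. A minimal sketch with a stand-in for CustomSentenceBreakingLangVars (the class name and sample text are made up):

from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer


class PeriodAndBangLangVars(PunktLanguageVars):
    # Hypothetical stand-in: only '.' and '!' end sentences, mirroring the example above.
    sent_end_chars = ('.', '!')


tokenizer = PunktSentenceTokenizer(lang_vars=PeriodAndBangLangVars())
print(tokenizer.tokenize("Any pain today? None reported. Continue current meds!"))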
Example #56
0
def generate_tweet(nltk_text):
	text = nltk_text.generate
	out = store_output(text, 10000)
	out = PunktSentenceTokenizer().tokenize(out)
	out = out[5:]	# get rid of initial jargon/repetition
	criteria = False
	num_tries = 0
	while not criteria and num_tries <= TRY_LIMIT:
		tweet = choice(out)
		tweet = pre_process(tweet)
		if len(tweet) > 80 and len(tweet) < 140:
			criteria = True
		else:
			num_tries += 1
	if criteria:
		return tweet
	else:
		return None
Example #57
0
def get_spliter():
    with open('sent_tokenize_model_v1.0.pkl', 'rb') as fs:
        punkt_param = pickle.load(fs)

    punkt_param.sent_starters = {}
    abbrev_types = ['g.m.t', 'e.g', 'dr', 'dr', 'vs', "000", 'mr', 'mrs', 'prof', 'inc', 'tp', 'ts', 'ths',
                    'th', 'vs', 'tp', 'k.l', 'a.w.a.k.e', 't', 'a.i', '</i', 'g.w',
                    'ass',
                    'u.n.c.l.e', 't.e.s.t', 'ths', 'd.c', 've…', 'ts', 'f.t', 'b.b', 'z.e', 's.g', 'm.p',
                    'g.u.y',
                    'l.c', 'g.i', 'j.f', 'r.r', 'v.i', 'm.h', 'a.s', 'bs', 'c.k', 'aug', 't.d.q', 'b…', 'ph',
                    'j.k', 'e.l', 'o.t', 's.a']
    abbrev_types.extend(string.ascii_uppercase)
    for abbrev_type in abbrev_types:
        punkt_param.abbrev_types.add(abbrev_type)
    for abbrev_type in string.ascii_lowercase:
        punkt_param.abbrev_types.add(abbrev_type)
    return PunktSentenceTokenizer(punkt_param)
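Abbreviations are matched lowercase and without the trailing period (as in the list above), and a match stops that period from being treated as a sentence boundary. A minimal sketch showing the effect with a fresh PunktParameters object and made-up text:

from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

params = PunktParameters()
params.abbrev_types.update(['dr', 'prof', 'e.g'])  # lowercase, no trailing period

splitter = PunktSentenceTokenizer(params)
print(splitter.tokenize("Dr. Lan spoke first. Prof. Nam answered, e.g. with a short example."))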
Example #58
0
class SentenceTokenizer:
    def __init__(self):
        self.tokenizer = PunktSentenceTokenizer()

    def tokenize(self, string):
        instructions = string
        sentences = self.tokenizer.tokenize(instructions)
        standoffs = []
        lastStart = 0
        for sentence in sentences:
            startIdx = instructions.index(sentence, lastStart)
            endIdx = startIdx + len(sentence)
            standoffs.append(TextStandoff(string, (startIdx, endIdx)))
            lastStart = endIdx
        for s1 in standoffs:
            for s2 in standoffs:
                assert s1 == s2 or not s1.overlaps(s2)
        return standoffs
Example #59
0
class Summarization:
	def __init__(self,text):
		self.text=text
		self.text = ' '.join(self.text.strip().split('\n'))
		self.sentence_splitter = PunktSentenceTokenizer()
		self.sentences = self.sentence_splitter.tokenize(text)

	def tokenization(self):
		if(debug):
			return self.sentences
	def bag_of_words(self):
		self.bag_of_words_matrix = CountVectorizer().fit_transform(self.sentences)
		if(debug):
			return self.bag_of_words_matrix
	
	def normalization(self):
		self.normalized_matrix = TfidfTransformer().fit_transform(self.bag_of_words_matrix)
		self.similarity_graph = self.normalized_matrix * self.normalized_matrix.T
		if(debug):
			return self.normalized_matrix

	def similarity(self):
		if(debug):
			return self.similarity_graph			

	def textrank(self):
		self.nx_graph = nx.from_scipy_sparse_matrix(self.similarity_graph)
		self.scores = nx.pagerank(self.nx_graph)
		self.sorted_text = sorted(((self.scores[i],s) for i,s in enumerate(self.sentences)),reverse=True)
		if(debug):
			print "\n\n"
			print "Scores.....\n"
			print self.sorted_text
		return self.sorted_text
		
	def summarized_text(self):
		self.summary=""
		for i in range(len(self.sorted_text)):
			self.summary+=self.sorted_text[i][1]
		self.summary = ' '.join(self.summary.strip().split('\n'))
		self.summary = ' '.join(self.summary.split())		
		return self.summary
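The class above spreads its TextRank pipeline over several methods, so they have to be called in order after construction. A minimal usage sketch, assuming the module-level imports (CountVectorizer, TfidfTransformer, networkx as nx) and debug flag from the original file, plus a hypothetical input file:

text = open('article.txt').read()
summarizer = Summarization(text)
summarizer.bag_of_words()        # sentence/term count matrix
summarizer.normalization()       # TF-IDF weights and the sentence similarity graph
ranked = summarizer.textrank()   # (score, sentence) pairs, highest score first
print(ranked[:3])
print(summarizer.summarized_text())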
Example #60
0
def getrank(document):

    sentences = PunktSentenceTokenizer().tokenize(document)

    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    similarity_graph = normalized * normalized.T

    nx_graph = networkx.from_scipy_sparse_matrix(similarity_graph)
    values = networkx.pagerank(nx_graph)
    sentence_array = sorted(((values[i], s) for i, s in enumerate(sentences)),
                            reverse=True)

    sentence_array = numpy.asarray(sentence_array)

    freq_max = float(sentence_array[0][0])
    freq_min = float(sentence_array[len(sentence_array) - 1][0])

    temp_array = []
    for i in range(0, len(sentence_array)):
        if freq_max - freq_min == 0:
            temp_array.append(0)
        else:
            temp_array.append((float(sentence_array[i][0]) - freq_min) /
                              (freq_max - freq_min))

    threshold = (sum(temp_array) / len(temp_array)) + 0.25

    sentence_list = []

    for i in range(0, len(temp_array)):
        if temp_array[i] > threshold:
            sentence_list.append(sentence_array[i][1])

    seq_list = []
    for sentence in sentences:
        if sentence in sentence_list:
            seq_list.append(sentence)

    return seq_list
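A short usage sketch for getrank, assuming the module-level imports (CountVectorizer, TfidfTransformer, networkx, numpy) and a hypothetical input file:

document = open('review.txt').read()
for sentence in getrank(document):
    print(sentence)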