def texttiling():
    conn = db_conn("map")
    cur = conn.cursor()
    tt = TextTilingTokenizer()
    # select all unique observations
    sql = "SELECT DISTINCT(observation) FROM utterances"
    cur.execute(sql)
    unique_observs = [t[0] for t in cur.fetchall()]
    # for each obsv
    for i, obsv in enumerate(unique_observs):
        sql = 'SELECT utterID, tagged FROM utterances WHERE observation = %s AND tagged <> ""'
        cur.execute(sql, [obsv])
        utter_id, tagged = zip(*cur.fetchall())
        text = "\n\n\n\t".join(tagged)
        try:
            segmented_text = tt.tokenize(text)
        except Exception as e:
            raise e
        else:
            uid_idx = 0
            for j, seg in enumerate(segmented_text):
                topic_id = j + 1
                sents = [s for s in seg.split("\n\n\n\t") if s != ""]
                for k, s in enumerate(sents):
                    in_topic_id = k + 1
                    sql = "UPDATE utterances SET topicID = %s, inTopicID = %s \
                        WHERE observation = %s AND utterID = %s"
                    cur.execute(sql, (topic_id, in_topic_id, obsv, utter_id[uid_idx]))
                    uid_idx += 1
                    conn.commit()
            sys.stdout.write("\r{}/{}".format(i + 1, len(unique_observs)))
            sys.stdout.flush()
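The '\n\n\n\t' delimiter in the example above does double duty: the blank lines give TextTilingTokenizer the paragraph breaks it requires, and the tab keeps the delimiter distinctive so each returned segment can be split back into the original utterances. A minimal, hypothetical round-trip illustration (not part of the original code):

# Round trip assumed by the example above: join utterances, tile, split back.
utterances = ['hello there', 'how are you doing today', 'fine thanks, and you']
joined = '\n\n\n\t'.join(utterances)
# NLTK's tiles are contiguous slices of the input, so splitting any segment
# on the same delimiter recovers whole utterances.
recovered = [u for u in joined.split('\n\n\n\t') if u != '']
assert recovered == utterances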
Example #3
def texttiling_text(text, k=20, w=40, smoothing_width=10, smoothing_rounds=5):
    tt = TextTilingTokenizer(stopwords=raw_stopword_list,
                             k=k,
                             w=w,
                             smoothing_width=smoothing_width,
                             smoothing_rounds=smoothing_rounds)

    o = tt.tokenize(text)
    return o
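A sketch of how texttiling_text might be called, assuming raw_stopword_list is populated elsewhere in the module and the input already contains blank-line paragraph breaks (TextTilingTokenizer fails on very short texts); the file name is hypothetical:

with open('lecture_transcript.txt') as fp:   # hypothetical input file
    raw = fp.read()
for i, segment in enumerate(texttiling_text(raw, k=20, w=40), start=1):
    print(i, len(segment.split()), 'words')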
def split_pp_to_paragraphs(clean_pp, contractions_dict, pattern):
    """
    Uses TextTilingTokenizer to split to paragraphs, the
    privacy policy document should be pre-processed (HTML cleaned) before reaching this function.
    :param clean_pp: clean pp before expansion of contractions and special cases
    :param contractions_dict: a dictionary that includes all varieties of contractions and their expansion
    :param pattern: pattern for the expansion of contractions
    :return: list of paragraphs
    """
    clean_pp = clean_pp_advanced(clean_pp, contractions_dict, pattern)
    ttt = TextTilingTokenizer()
    paragraphs = ttt.tokenize(clean_pp)
    return paragraphs
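A hypothetical call site for split_pp_to_paragraphs; the contractions_dict and pattern arguments are assumed to come from the module's contraction-expansion setup, which is not shown here:

with open('privacy_policy_clean.txt') as fp:   # hypothetical pre-cleaned policy text
    clean_pp = fp.read()
paragraphs = split_pp_to_paragraphs(clean_pp, contractions_dict, pattern)
print(len(paragraphs), 'paragraphs found')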
Example #5
def segments(txt):

    ttt = TextTilingTokenizer()
    tokens = ttt.tokenize(txt)

    start = 0
    end = 0
    tileSpan = []

    for token in tokens:
        end = start + len(token)
        tileSpan.append((start, end))
        start = end
    return tileSpan
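Because NLTK's tiles are contiguous slices of the input text, the (start, end) pairs returned above can be used to slice the original string back into segments. A short usage sketch with a hypothetical file:

with open('article.txt') as fp:        # hypothetical document with '\n\n' breaks
    text = fp.read()
for start, end in segments(text):
    print(start, end, repr(text[start:end][:60]))   # preview each tile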
def demo(text=None):
    from nltk.corpus import brown
    from matplotlib import pylab
    tt = TextTilingTokenizer(demo_mode=True)
    if text is None:
        text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
class TopicTokenizer:
    """
    Text tiling tokenizer
    """
    def __init__(self,
                 cutoff_policy='HC',
                 stop_words=stopwords.words('english'),
                 w=20,
                 k=10):
        """
        Constructor
        """
        self.__stop_words = stop_words
        self.__cutoff_policy = cutoff_policy
        self.__w = w
        self.__k = k
        self.__tiler = TextTilingTokenizer(stopwords=stop_words,
                                           cutoff_policy=cutoff_policy,
                                           w=w,
                                           k=k)

    def get_boundaries(self, text):
        """
        Get potential topic boundaries between the text.

        :param text:    The text to tile
        :return:    A list of potential topics
        """
        topics = self.__tiler.tokenize(text)
        return topics

    def reload_tiler(self):
        """
        Reload the text tiler. Use if memory is an issue.
        """
        del self.__tiler
        self.__tiler = TextTilingTokenizer(
            stopwords=self.__stop_words,
            cutoff_policy=self.__cutoff_policy,
            w=self.__w,
            k=self.__k)
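A brief usage sketch for the wrapper above; the document path is hypothetical and the text must contain blank-line paragraph breaks:

tokenizer = TopicTokenizer(w=20, k=10)
with open('meeting_notes.txt') as fp:    # hypothetical document
    topics = tokenizer.get_boundaries(fp.read())
print('found', len(topics), 'topic blocks')
tokenizer.reload_tiler()                 # rebuild the tiler if memory is a concern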
Example #9
def segment_transcript(doc):
    """doc is a document object with text lines
    in 'transcript',
    add a list of 'topics' to the document object
    and return it
    """

    tok = TextTilingTokenizer()

    lines = [turn['text'] for turn in doc['lines']]
    text = "\n\n".join(lines)

    doc['topics'] = []
    start = 0
    for topic in tok.tokenize(text):
        length = len(topic.strip().split('\n\n'))
        end = start + length
        doc['topics'].append({'start': start, 'end': end})
        start = end

    return doc
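The expected input is a dict whose 'lines' entry is a list of turns, each carrying a 'text' field. A schematic sketch with a hypothetical loader; note that TextTiling needs a reasonably long transcript, so a handful of turns will not segment:

turns = load_transcript_turns('meeting_001.json')   # hypothetical helper returning a list of strings
doc = {'lines': [{'text': t} for t in turns]}
doc = segment_transcript(doc)
for topic in doc['topics']:
    print('lines', topic['start'], 'to', topic['end'])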
def texttiling_BNC():
    conn = db_conn('bnc')
    cur = conn.cursor()
    # select unique convId
    query = 'select distinct(convId) from entropy_DEM_full'
    cur.execute(query)
    conv_ids = [t[0] for t in cur.fetchall()]

    # for each convId, do texttiling, and update the episodeId and inEpisodeId columns
    tt = TextTilingTokenizer()
    for i, cid in enumerate(conv_ids):
        query = 'select strLower from entropy_DEM_full where convId = %s'
        cur.execute(query, [cid])
        text = '\n\n\n\t'.join([t[0] for t in cur.fetchall()])
        try:
            segmented = tt.tokenize(text)
        except Exception as e:
            # the conversation is too short to be tiled; skip it
            if str(e) in ('Input vector needs to be bigger than window size.',
                          'No paragraph breaks were found(text too short perhaps?)'):
                pass
            else:
                raise
        else:
            global_id = 1
            for j, seg in enumerate(segmented):
                epi_id = j + 1
                sents = [s for s in seg.split('\n\n\n\t') if s != '']
                for k, s in enumerate(sents):
                    in_epi_id = k + 1
                    # update
                    query = 'update entropy_DEM_full set episodeId = %s, inEpisodeId = %s \
                        where convId = %s and globalId = %s'
                    cur.execute(query, (epi_id, in_epi_id, cid, global_id))
                    global_id += 1
            # print progress
            sys.stdout.write('\r%s/%s updated' % (i+1, len(conv_ids)))
            sys.stdout.flush()
    # commit
    conn.commit()
Example #11
class TexttileWrapper:
    def __init__(self):
        self._tt = TextTilingTokenizer()

    def sentence_array_texttile(self, sentences):
        text = "  \n\n".join(x for x in sentences if len(x) > 0) + "\n\n"
        tok = self._tt.tokenize(text)

        assignments = [0] * len(sentences)

        if tok:
            for ii in range(len(sentences)):
                try:
                    assignments[ii] = min(x for x in range(len(tok)) if sentences[ii] in tok[x]) + 1
                except ValueError:
                    print("ERROR %i!" % ii)
                    #print(text.encode("ascii", "ignore"))
                    #print(tok)
                    assignments[ii] = 0

        print "**************"
        print assignments

        # Make assignments monotonically increasing
        last_assignment = -1
        assignments_seen = -1
        for ii in range(len(assignments)):
            if assignments[ii] != last_assignment:
                assignments_seen += 1
            last_assignment = assignments[ii]
            assignments[ii] = assignments_seen
        print(assignments)

        return assignments


    def fallback_segmenter(self, text, max_sentence_length = 500,
                           arbitrary_words_per_sent = 30,
                           max_sentences_per_texttile = 15,
                           arbitrary_sentences_per_tile = 6):
        # First, try to segment into sentences with punkt
        sentences = punkt.tokenize(text)

        # If that doesn't work, use a really stupid regexp
        longest_sentence = max(len(x) for x in sentences)
        print("Longest sentence is %i" % longest_sentence)
        if longest_sentence > max_sentence_length:
            print("Using regexp sentence breaker")
            sentences = punct_regexp.findall(text)

        # If that still doesn't work, use arbitrary breaks
        if max(len(x) for x in sentences) > 600:
            print "Using ad hoc sentence breaker"
            sentences = []
            words = text.split()
            num_words = len(words)
            for ii in range(num_words // arbitrary_words_per_sent + 1):
                sentences.append(" ".join(words[ii * arbitrary_words_per_sent:
                                                min((ii + 1) * arbitrary_words_per_sent,
                                                    num_words)]))

        # Now feed that into texttile
        print(sentences)
        try:
            tile_assignments = self.sentence_array_texttile(sentences)
            tiles = set(tile_assignments)
        except ValueError:
            tile_assignments = None

        # If that doesn't work, split "sentences", however defined, into reasonable
        # sized chunks
        if tile_assignments is None or max(sum(1 for y in tile_assignments if y == x) for x in tiles) > max_sentences_per_texttile:
            tile_assignments = [x // arbitrary_sentences_per_tile for x in range(len(sentences))]

        return sentences, tile_assignments

    def fallback_wrapper(self, text):
        sentences, assignments = self.fallback_segmenter(text)

        num_sents = len(sentences)
        tiles = []
        for ii in range(max(assignments) + 1):
            tiles.append(" ".join(sentences[x] for x in range(num_sents) \
                                    if assignments[x] == ii))
        return tiles
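The class above refers to module-level punkt and punct_regexp objects that are not included in this snippet; one plausible definition, stated purely as an assumption, would be:

import re
import nltk.data

# Assumed definitions for the globals used by fallback_segmenter above.
punkt = nltk.data.load('tokenizers/punkt/english.pickle')   # Punkt sentence tokenizer
punct_regexp = re.compile(r'[^.!?]+[.!?]?')                  # crude sentence-like chunks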
Example #12
def get_paragraphs_from_text(text):
    tiling_tokenizer = TextTilingTokenizer()
    paragraphs = tiling_tokenizer.tokenize(text)
    return paragraphs
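TextTilingTokenizer raises an error when the text lacks blank-line paragraph breaks or is shorter than its window (the BNC example above matches those messages explicitly), so callers may want a guard like the following sketch:

def get_paragraphs_safely(text):
    # Illustrative guard, not part of the original example: fall back to the
    # whole text as a single paragraph when tiling is impossible.
    try:
        return get_paragraphs_from_text(text)
    except ValueError:
        return [text]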
Example #13
from nltk import word_tokenize
import codecs
from argparse import ArgumentParser
import os

argparser = ArgumentParser()
argparser.add_argument('file', help="text document")
args = argparser.parse_args()

stopwords = stopwords.words('english')

doc_path = os.path.splitext(args.file)[0]

tt = TextTilingTokenizer()
text = codecs.open(doc_path + '.txt', 'r', "utf-8").read()
parags = tt.tokenize(text)

buffer_tiled = ''
buffer_tiled_tagged = ''
buffer_tiled_tagged_clean = ''

tagged_parags = pos_tag_sents([word_tokenize(p) for p in parags])
clean_parags = [
    filter(lambda taggedword: taggedword[0] not in stopwords, p)
    for p in tagged_parags
]

for i, p in enumerate(parags):
    buffer_tiled += p

    for word, tag in tagged_parags[i]:
Example #14
    cur.execute(sql)
    convIDs = [tup[0] for tup in cur.fetchall()]
    convIDs.sort()

    # tokenizer
    tt = TextTilingTokenizer()

    # get the text of each convID, and do the TextTiling
    failed_convIDs = []
    for cid in convIDs:
        sql = 'SELECT rawWord FROM entropy_DEM100 WHERE convID = %s'
        cur.execute(sql, [cid])
        text = '\n\n\n\t'.join([tup[0] for tup in cur.fetchall()])

        try:
            segmented_text = tt.tokenize(text)
        except Exception:
            print('convID %d failed' % cid)
            failed_convIDs.append(cid)
        else:
            global_idx = 1
            for i, seg in enumerate(segmented_text):
                topic_idx = i + 1
                sents = [s for s in seg.split('\n\n\n\t') if s != '']
                for j, s in enumerate(sents):
                    in_topic_idx = j + 1
                    # update columns in table
                    sql = 'UPDATE entropy_DEM100 SET topicID = %s, inTopicID = %s WHERE convID = %s AND globalID = %s'
                    cur.execute(sql,
                                (topic_idx, in_topic_idx, cid, global_idx))
                    # increase global index
Example #15
class TextHandler(object):
    '''
    Parse Out Individual Pieces of Text. 
    Tools available from this class include:
      -Text Tiling
      -RBF Networks
      -Fuzzy Clustering with Cosine Distances (custom and open source implementation of mine)
      -MNB recognition
      -sent tokenizing
      -minimum distance matching
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.__mnbClassifier=None
        self.__rbfClassifier=None
        self.__textTiler=None
        self.__vectorizer=None
        self.__tfidf=None
        self.__mnbTrained=False
        self.__clustMatrix=None
        self.__preds=[]
    
    
    def delMNB(self):
        '''
        GC Object
        '''
        self.__mnbClassifier=None
        self.__mnbTrained=False
        gc.collect()
        del gc.garbage[:]
    
    def delRBF(self):
        '''
        GC Object
        '''
        self.__rbfClassifier=None
        gc.collect()
        del gc.garbage[:]
    

    
    def delTextTiler(self):
        '''
        GC Object
        '''
        self.__textTiler=None
        gc.collect()
        del gc.garbage[:]
    
    def delVectorizer(self):
        '''
        GC Object
        '''
        self.__vectorizer=None
        gc.collect()
        del gc.garbage[:]
    
    def delTfIdf(self):
        '''
        GC Object
        '''
        self.__tfidf=None
        gc.collect()
        del gc.garbage[:]
    
    def delClustMatrix(self):
        '''
        GC Object
        '''
        self.__clustMatrix=None
        gc.collect()
        del gc.garbage[:]
    
    
    def resetPreds(self):
        '''
        GC Object
        '''
        self.__preds=[]
        gc.collect()
        del gc.garbage[:]
    
    def getPreds(self):
        '''
        Return a list of the current prediction names.
        '''
        return self.__preds
    
    def getClustMatrix(self):
        '''
        Returns the cluster matrix
        '''
        return self.__clustMatrix
    
    
    def getNamedEntities(self):
        '''
        Parse out named entities
        '''
        pass
    
    def buildVectorizer(self,vector):
        '''
        Instantiates the vectorizers such as count vectorizer
        or tfidf vectorizer. This is useful when calling the vectorizer
        multiple times.
        
        *Required Parameters*
        :param vector: which vectorizer to build ('count','tfidf')
        
        '''
        if vector == 'count':
            self.__vectorizer=CountVectorizer(stop_words='english')
        elif vector == 'tfidf':
            self.__tfidf=TfidfTransformer(norm='l2')
    
    def trainVectorizers(self,document):
        '''
        Train the Vectorizers with a document that should be tokenized into sentences and words
        
        **Warning: All listed items will be concatenated to a single matrix**
        
        *Required Parameters*
        :param document: the document (text) or list of documents (file paths) to build count and tfidf vectorizers with (be as representative as possible)
        '''
        
        self.buildVectorizer('count')
        self.buildVectorizer('tfidf')
        
        if type(document) is str:
            self.__tfidf.fit(self.__vectorizer.fit_transform(document))
        else:
            uvecs=None
            sentences=[]
            for doc in document:
                if os.path.exists(doc) is True:
                    sentences=[]
                    with open(doc,'r') as fp:
                        sentences.extend([parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(fp.read())])
        
            if uvecs is not None:
                self.__tfidf.fit(self.__vectorizer.fit(sentences))
                    
    def buildClassifier(self,classifier):
        '''
        Instantiates each of the Classifiers. Vectorizers
        should be built separately
        
        
        *Required Parameters*
        :param classifier: specify which type of classifier to build (mnb,rbf,sent,textTile)
        '''
        
        if classifier == 'mnb':
            self.__mnbClassifier=MultinomialNB()
        elif classifier == 'rbf':
            self.__rbfClassifier=SVC()
        elif classifier=='sent':
            self.__sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
        elif classifier == 'textTile':
            self.__textTiler=TextTilingTokenizer()
        elif classifier=='clust':
            self.__clustMatrix=None
    
    def trainMinDistanceMat(self,cats):
        '''
        Train the Minimum Distance Cluster Matrix from given files
        **WARNING: This will turn a sparse vector to a dense vector**
        
        *Required Parameters*
        :param cats: categories to use
        '''
        for cat in cats:
            if os.path.exists(cat) is True:
                sentences=[]
                with open(cat,'r') as fp:
                    sentences=[parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(fp.read())]
                
                if len(sentences)>0:
                    vecs=self.__vectorizer.transform(sentences)
                    vecs=self.__tfidf.transform(vecs)
                    
                    if self.__clustMatrix is None:
                        self.__clustMatrix=vecs.mean()
                    else:
                        self.__clustMatrix=scipy.sparse.vstack((self.__clustMatrix,vecs.mean()))
    
    def getMinDistanceCategory(self,document):
        '''
        Find the closest trained category for the given document.
        
        Requires building the clustMatrix and vectorizers.
        
        **WARNING: This will turn a sparse vector to a dense vector**
        
        *Required Parameters*
        :param document: document to test with
        '''
        vecs=self.__tfidf.transform(self.__)
        
    
    def trainMNB(self,cats,partial=False):
        '''
        Train Multinomial Naive Bayes on the given category files. The sentence tokenizer must be instantiated first.
        
        *Required Parameters*
        :param cats: list of category directories
        
        *Optional Parameters*
        :param partial: whether to create a partial fit from the data (if using partial, please train the vectorizers first)
        ''' 
        self.__preds=[]
        gc.collect()
        del gc.garbage[:]
        
        if self.__vectorizer is None:
            self.buildVectorizer('count')
        if self.__tfidf is None:
            self.buildVectorizer('tfidf')
        
        cl=[]

        uvecs=None
        for cat in cats:
            if os.path.exists(cat) is True:
                sentences=[]
                with open(cat,'r') as fp:
                    sentences=[parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(fp.read())]
                
                if len(sentences)>0:
                    vecs=self.__vectorizer.transform(sentences)
                    vecs=self.__tfidf.transform(vecs)
                    
                    if partial is True:
                        self.__preds.append(re.sub(r"\..*|/","",os.path.split(cat)[1]).strip())
                        
                        for i in range(vecs.shape[0]):
                            cl.append(len(self.__preds)-1)
                        self.__mnbClassifier.partial_fit(vecs,numpy.asarray(cl))
                        cl=[]
                    else:
                        self.__preds.append(re.sub(r"\..*|/","",os.path.split(cat)[1]).strip())
                        for i in range(vecs.shape[0]):
                            cl.append(len(self.__preds)-1)
                        
                        if uvecs is None:
                            uvecs=vecs
                        else:
                            uvecs=scipy.sparse.vstack((uvecs,vecs))
        
        if partial is False and uvecs is not None:
            self.__mnbClassifier.fit(uvecs, cl)
        
        del uvecs
        del cl
        gc.collect()
        del gc.garbage[:]
                        
    def classifyMNB(self,document):
        '''
        Multinomial Naive Bayes: the fastest but least reliable option. Use only if the topics are clearly distinguishable.
        Requires building vectorizers and training MNB first. Returns the name and number of the category to work with
        
        *Required Parameters*
        :param document: document to classify
        '''
        sentences=numpy.asarray([parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(document)])
        vecs=self.__tfidf.transform(self.__vectorizer.transform(sentences))
        return [(self.__preds[x],x) for x in self.__mnbClassifier.predict(vecs)]
    
    def trainRBF(self,cats):
        '''
        Trains an RBF classifier for use in categorization.
        There is no Partial fit for a neural network. Everything must fit in memory.
        
        *Required Parameters*
        :param cats: list of category files to train on
        '''
        
        cl=[]
        uvecs=None
        self.__preds=[]
        gc.collect()
        del gc.garbage[:]
        for cat in cats:
            if os.path.exists(cat) is True:
                sentences=[]
                with open(cat,'r') as fp:
                    sentences=numpy.asarray([parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(fp.read())])
                
                if len(sentences)>0:
                    vecs=self.__vectorizer.transform(sentences)
                    vecs=self.__tfidf.transform(vecs)
                    
                    
                    if uvecs is None:
                        uvecs=vecs
                    else:
                        uvecs=scipy.sparse.vstack((uvecs,vecs))
                    self.__preds.append(re.sub(r"\..*|/","",os.path.split(cat)[1]).strip())
                    for i in range(vecs.shape[0]):
                        cl.append(len(self.__preds)-1)
        
        if uvecs is not None:
            self.__rbfClassifier.fit(uvecs, cl)
        
        del cl
        del uvecs
        gc.collect()
        del gc.garbage[:]
    
    def classifyRBF(self,document):
        '''
        Classify with RBF Neural Network from SK Learn.
        
        Requires the sentence tokenizer, the count and tfidf vectorizers, and the RBF classifier to be trained first.
        
        *Required Parameter*
        :param document: text document to use
        '''
        sentences=numpy.asarray([parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(document)])
        vecs=self.__tfidf.transform(self.__vectorizer.transform(sentences))
        return [(self.__preds[x],x) for x in self.__rbfClassifier.predict(vecs)]
    
    def sentTokenize(self,document,do_parse=False,remPunc=True):
        '''
        Tokenize the document into sentences using the punkt tokenizer from nltk.
        No vectorizers are necessary but the sent_tokenizer needs to
        be established.
        
        *Required Parameters*
        :param document: full text of document
        
        *Optional Parameters*
        :param do_parse: whether to use CLIPS pattern to stem and disambiguate the sentences
        :param remPunc: whether to remove punctuation (default is true) [certain algos. such as max ent for sentence detection may require False]
        '''
        print(self.__sent_tokenizer.tokenize(document))
        # the flag is named do_parse so it does not shadow pattern's parse() function
        sentences=[(parse(x,tags=False,chunks=False).split(" ") if do_parse is True else x) for x in self.__sent_tokenizer.tokenize(document)]
        return sentences
    
    def textTiler(self,document,parse=False):
        '''
        Tile Text for further Processing. Separation by topic is recommended before identifying what that topic is.
        Even better results can be obtained with SimplrTerms feature folder but a tool like that can take a while.
        
        *Required Parameters*
        :param document: The Document to Tile
        
        *Optional Parameters*
        :param parse: whether to stem and disambiguate sentences in the document using pattern clips
        '''
        if parse is True:
            document="\n".join(self.sentTokenize(document, parse,remPunc=False))
        return self.__textTiler.tokenize(document)
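A hedged usage sketch for the text-tiling path of the class above, assuming the NLTK punkt model is installed; the file name is hypothetical:

handler = TextHandler()
handler.buildClassifier('sent')        # loads the punkt sentence tokenizer
handler.buildClassifier('textTile')    # instantiates TextTilingTokenizer
with open('report.txt') as fp:         # hypothetical document
    tiles = handler.textTiler(fp.read())
print(len(tiles), 'tiles')
handler.delTextTiler()                 # release the tiler when finished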
Example #16
    tt = TextTilingTokenizer()
    # tt_demo = TextTilingTokenizer(demo_mode = True)

    # get all conversation IDs
    sql = 'SELECT DISTINCT(convID) FROM entropy'
    cur.execute(sql)
    convIDs = [tup[0] for tup in cur.fetchall()]

    # get text for each cid and do text tiling
    for cid in convIDs:
        sql = 'SELECT tagged FROM entropy WHERE convID = %d' % cid
        cur.execute(sql)
        text = '\n\n\n\t'.join([tup[0] for tup in cur.fetchall()])
        # tiling
        try:
            segmented_text = tt.tokenize(text)
        except Exception:
            pass
        else:
            global_id = 0
            for i, seg in enumerate(segmented_text):
                tile_id = i + 1
                sents = [s for s in seg.split('\n\n\n\t') if s != '']
                for j, s in enumerate(sents):
                    in_tile_id = j + 1
                    global_id += 1
                    # obtain the entropy
                    sql = 'SELECT ent FROM entropy WHERE convID = %d AND globalID = %d' % (cid, global_id)
                    cur.execute(sql)
                    ent = cur.fetchone()[0]
                    # insert to textTiling table
Example #17
import codecs
from argparse import ArgumentParser
import os

argparser = ArgumentParser()
argparser.add_argument("file", help="text document")
args = argparser.parse_args()


stopwords = stopwords.words("english")

doc_path = os.path.splitext(args.file)[0]

tt = TextTilingTokenizer()
text = codecs.open(doc_path + ".txt", "r", "utf-8").read()
parags = tt.tokenize(text)


buffer_tiled = ""
buffer_tiled_tagged = ""
buffer_tiled_tagged_clean = ""

tagged_parags = pos_tag_sents([word_tokenize(p) for p in parags])
clean_parags = [filter(lambda taggedword: taggedword[0] not in stopwords, p) for p in tagged_parags]

for i, p in enumerate(parags):
    buffer_tiled += p

    for word, tag in tagged_parags[i]:
        buffer_tiled_tagged += word + "/" + tag + " "
        if word not in stopwords: