Example No. 1
def texttiling():
    conn = db_conn("map")
    cur = conn.cursor()
    tt = TextTilingTokenizer()
    # select all unique observations
    sql = "SELECT DISTINCT(observation) FROM utterances"
    cur.execute(sql)
    unique_observs = [t[0] for t in cur.fetchall()]
    # for each obsv
    for i, obsv in enumerate(unique_observs):
        sql = 'SELECT utterID, tagged FROM utterances WHERE observation = %s AND tagged <> ""'
        cur.execute(sql, [obsv])
        utter_id, tagged = zip(*cur.fetchall())
        text = "\n\n\n\t".join(tagged)
        try:
            segmented_text = tt.tokenize(text)
        except Exception as e:
            raise e
        else:
            uid_idx = 0
            for j, seg in enumerate(segmented_text):
                topic_id = j + 1
                sents = [s for s in seg.split("\n\n\n\t") if s != ""]
                for k, s in enumerate(sents):
                    in_topic_id = k + 1
                    sql = "UPDATE utterances SET topicID = %s, inTopicID = %s \
                        WHERE observation = %s AND utterID = %s"
                    cur.execute(sql, (topic_id, in_topic_id, obsv, utter_id[uid_idx]))
                    uid_idx += 1
                    conn.commit()
            sys.stdout.write("\r{}/{}".format(i + 1, len(unique_observs)))
            sys.stdout.flush()
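
The '\n\n\n\t' separator used above is what lets the code map each TextTiling segment back to individual rows: in NLTK's implementation, TextTilingTokenizer.tokenize only cuts at blank-line paragraph breaks and returns contiguous slices of its input, so splitting each segment on the same separator recovers the utterances in order. A minimal, self-contained sketch of that round trip (the utterances are invented stand-ins for the database rows):

from nltk.tokenize.texttiling import TextTilingTokenizer

# invented stand-ins for the rows fetched from the utterances table
utterances = ["this is utterance number %d of the conversation" % i
              for i in range(1, 101)]

SEP = "\n\n\n\t"
text = SEP.join(utterances)

tt = TextTilingTokenizer()
try:
    segments = tt.tokenize(text)   # raises ValueError if the text is too short
except ValueError:
    segments = [text]              # fall back to a single segment

uid_idx = 0
for topic_id, seg in enumerate(segments, start=1):
    sents = [s for s in seg.split(SEP) if s != ""]
    for in_topic_id, _ in enumerate(sents, start=1):
        # this is where the example above issues its UPDATE for utter_id[uid_idx]
        uid_idx += 1

# every utterance is recovered exactly once across the segments
assert uid_idx == len(utterances)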
Example No. 3
def texttiling_text(text, k=20, w=40, smoothing_width=10, smoothing_rounds=5):
    tt = TextTilingTokenizer(stopwords=raw_stopword_list,
                             k=k,
                             w=w,
                             smoothing_width=smoothing_width,
                             smoothing_rounds=smoothing_rounds)

    o = tt.tokenize(text)
    return o
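
A hedged usage sketch for texttiling_text above, assuming the function is in scope in the same module. raw_stopword_list is defined elsewhere in that project, so NLTK's English stopword list is substituted here, and the input file name is made up:

from nltk.corpus import stopwords

# stand-in for the project's module-level stopword list (an assumption)
raw_stopword_list = stopwords.words('english')

with open('long_document.txt', encoding='utf-8') as fh:   # hypothetical input
    document = fh.read()

# w is the pseudosentence size and k the block-comparison size in NLTK's
# TextTilingTokenizer; short texts without blank lines raise ValueError
try:
    tiles = texttiling_text(document, k=20, w=40)
    print('%d tiles' % len(tiles))
except ValueError as err:
    print('TextTiling failed: %s' % err)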
Example No. 4
def split_pp_to_paragraphs(clean_pp, contractions_dict, pattern):
    """
    Uses TextTilingTokenizer to split the privacy policy document into paragraphs;
    the document should be pre-processed (HTML cleaned) before reaching this function.
    :param clean_pp: clean pp before expansion of contractions and special cases
    :param contractions_dict: a dictionary that includes all varieties of contractions and their expansion
    :param pattern: pattern for the expansion of contractions
    :return: list of paragraphs
    """
    clean_pp = clean_pp_advanced(clean_pp, contractions_dict, pattern)
    ttt = TextTilingTokenizer()
    paragraphs = ttt.tokenize(clean_pp)
    return paragraphs
Example No. 5
def segments(txt):

    ttt = TextTilingTokenizer()
    tokens = ttt.tokenize(txt)

    start = 0
    end = 0
    tileSpan = []

    for token in tokens:
        end = start + len(token)
        tileSpan.append((start, end))
        start = end
    return tileSpan
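
Because tokenize returns contiguous slices of its input, the (start, end) pairs produced by segments above index directly into the original string. A small sketch of consuming those spans (the file name is made up):

with open('transcript.txt', encoding='utf-8') as fh:   # hypothetical input
    txt = fh.read()

for start, end in segments(txt):
    tile = txt[start:end]
    print('%6d:%6d  %s...' % (start, end, tile.strip()[:60]))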
Example No. 7
def demo(text=None):
    from nltk.corpus import brown
    from matplotlib import pylab
    tt = TextTilingTokenizer(demo_mode=True)
    if text is None:
        text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
Example No. 8
def vis_tokenize(context, question):

    glove = utils.load_glove(dim=200)

    ttt = TextTilingTokenizer()

    para_list = []
    paras = [para for para in context.split('\\n') if para != '']
    for para in paras:
        sent_list = []
        for sent in sent_tokenize(para):
            temp = {}
            temp['words'] = word_tokenize(sent)
            temp['vectors'] = [
                np.array(glove[word.lower()]) for word in temp['words']
            ]
            sent_list.append(temp)
        para_list.append(sent_list)

    q_dict = {}
    q_dict['words'] = word_tokenize(question)
    q_dict['vectors'] = [
        np.array(glove[word.lower()]) for word in q_dict['words']
    ]
    return para_list, q_dict
Example No. 9
def run():
    for idx, filename in enumerate(os.listdir(os.getcwd() + '/papers')):
        paper_path = os.getcwd() + '/papers/' + filename

        content = (convert_pdf_to_txt(paper_path)
                   .lower()
                   .replace('.   ', '. ')
                   .replace('.  ', '. '))
        # further replacements, left disabled:
        # .replace('\n\n', '\n')
        # .replace('\f', '')
        # .replace('\n', ' ')
        # .replace('-', ' ')

        relevant_text = get_intro_conclusion(content)
        raw_docs[idx] = relevant_text

        relevant_text = relevant_text\
            .replace('.   ', '. ')\
            .replace('.  ', '. ')\
            .replace('- ', '')

        # Utilising NLTK Text Tiling with default params
        # seg_2 = TextTilingTokenizer().tokenize(relevant_text)

        # Utilising NLTK Text Tiling with custom params(pseudosentence size, block comparison size)
        tt = TextTilingTokenizer(w=10, k=4)
        paper_tiles = tt.tokenize(relevant_text)

        text_tiles[idx] = paper_tiles

    return raw_docs, text_tiles
Example No. 10
 def __init__(self,
              cutoff_policy='HC',
              stop_words=stopwords.words('english'),
              w=20,
              k=10):
     """
     Constructor
     """
     self.__stop_words = stop_words
     self.__cutoff_policy = cutoff_policy
     self.__w = w
     self.__k = k
     self.__tiler = TextTilingTokenizer(stopwords=stop_words,
                                        cutoff_policy=cutoff_policy,
                                        w=w,
                                        k=k)
Example No. 11
 def reload_tiler(self):
     """
     Reload the text tiler. Use if memory is an issue.
     """
     del self.__tiler
     self.__tiler = TextTilingTokenizer(
         stopwords=self.__stop_words,
         cutoff_policy=self.__cutoff_policy,
         w=self.__w,
         k=self.__k)
Example No. 12
def segment_transcript(doc):
    """doc is a document object with text lines
    in 'transcript',
    add a list of 'topics' to the document object
    and return it
    """

    tok = TextTilingTokenizer()

    lines = [turn['text'] for turn in doc['lines']]
    text = "\n\n".join(lines)

    doc['topics'] = []
    start = 0
    for topic in tok.tokenize(text):
        length = len(topic.strip().split('\n\n'))
        end = start + length
        doc['topics'].append({'start': start, 'end': end})
        start = end

    return doc
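
A hedged sketch of the document shape segment_transcript expects, inferred from the code above (the turn texts are invented, and very short transcripts make TextTilingTokenizer raise ValueError):

doc = {
    'lines': [{'text': 'turn %d: some discussion of the current agenda item' % i}
              for i in range(200)]
}
try:
    doc = segment_transcript(doc)
    # each topic records the index of its first line ('start') and the index
    # one past its last line ('end')
    print(doc['topics'])
except ValueError as err:
    print('transcript too short to tile: %s' % err)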
Example No. 13
def texttiling_BNC():
    conn = db_conn('bnc')
    cur = conn.cursor()
    # select unique convId
    query = 'select distinct(convId) from entropy_DEM_full'
    cur.execute(query)
    conv_ids = [t[0] for t in cur.fetchall()]

    # for each convId, do texttiling, and update the episodeId and inEpisodeId columns
    tt = TextTilingTokenizer()
    for i, cid in enumerate(conv_ids):
        query = 'select strLower from entropy_DEM_full where convId = %s'
        cur.execute(query, [cid])
        text = '\n\n\n\t'.join([t[0] for t in cur.fetchall()])
        try:
            segmented = tt.tokenize(text)
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            if str(exc_obj) == 'Input vector needs to be bigger than window size.' or \
                str(exc_obj) == 'No paragraph breaks were found(text too short perhaps?)': # it means the conversation is too short
                pass
            else:
                raise
        else:
            global_id = 1
            for j, seg in enumerate(segmented):
                epi_id = j + 1
                sents = [s for s in seg.split('\n\n\n\t') if s != '']
                for k, s in enumerate(sents):
                    in_epi_id = k + 1
                    # update
                    query = 'update entropy_DEM_full set episodeId = %s, inEpisodeId = %s \
                        where convId = %s and globalId = %s'
                    cur.execute(query, (epi_id, in_epi_id, cid, global_id))
                    global_id += 1
            # print progress
            sys.stdout.write('\r%s/%s updated' % (i+1, len(conv_ids)))
            sys.stdout.flush()
    # commit
    conn.commit()
Example No. 14
 def buildClassifier(self,classifier):
     '''
     Instantiates each of the Classifiers. Vectorizers
     should be built separately
     
     
     *Required Parameters*
     :param classifier: specify which type of classifier to build (mnb,rbf,sent,textTile)
     '''
     
     if classifier == 'mnb':
         self.__mnbClassifier=MultinomialNB()
     elif classifier == 'rbf':
         self.__rbfClassifier=SVC()
     elif classifier=='sent':
         self.__sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
     elif classifier == 'textTile':
         self.__textTiler=TextTilingTokenizer()
     elif classifier=='clust':
         self.__clustMatrix=None
Example No. 15
class TopicTokenizer:
    """
    Text tiling tokenizer
    """
    def __init__(self,
                 cutoff_policy='HC',
                 stop_words=stopwords.words('english'),
                 w=20,
                 k=10):
        """
        Constructor
        """
        self.__stop_words = stop_words
        self.__cutoff_policy = cutoff_policy
        self.__w = w
        self.__k = k
        self.__tiler = TextTilingTokenizer(stopwords=stop_words,
                                           cutoff_policy=cutoff_policy,
                                           w=w,
                                           k=k)

    def get_boundaries(self, text):
        """
        Get potential topic boundaries between the text.

        :param text:    The text to tile
        :return:    A list of potential topics
        """
        topics = self.__tiler.tokenize(text)
        return topics

    def reload_tiler(self):
        """
        Reload the text tiler. Use if memory is an issue.
        """
        del self.__tiler
        self.__tiler = TextTilingTokenizer(
            stopwords=self.__stop_words,
            cutoff_policy=self.__cutoff_policy,
            w=self.__w,
            k=self.__k)
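
A hedged usage sketch for the TopicTokenizer class above; the input file name is made up, and the text is assumed to be long enough and to contain blank-line paragraph breaks:

tokenizer = TopicTokenizer(cutoff_policy='HC', w=20, k=10)

with open('report.txt', encoding='utf-8') as fh:   # hypothetical input
    text = fh.read()

topics = tokenizer.get_boundaries(text)
print('found %d topical sections' % len(topics))

tokenizer.reload_tiler()   # optionally rebuild the tiler to release memory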
Example No. 16
    def __init__(self, w=200, k=5):

        # call the superclass constructor
        TextTilingTokenizer.__init__(self, w, k, stopwords=nltkstop.words(LANG), demo_mode=True)
Example No. 17
 def __init__(self):
     self._tt = TextTilingTokenizer()
Example No. 18
class TexttileWrapper:
    def __init__(self):
        self._tt = TextTilingTokenizer()

    def sentence_array_texttile(self, sentences):
        text = "  \n\n".join(x for x in sentences if len(x) > 0) + "\n\n"
        tok = self._tt.tokenize(text)

        assignments = [0] * len(sentences)

        if tok:
            for ii in xrange(len(sentences)):
                try:
                    assignments[ii] = min(x for x in xrange(len(tok)) if sentences[ii] in tok[x]) + 1
                except ValueError:
                    print("ERROR %i!" % ii)
                    #print(text.encode("ascii", "ignore"))
                    #print(tok)
                    assignments[ii] = 0

        print "**************"
        print assignments

        # Make assignments monotonically increasing
        last_assignment = -1
        assignments_seen = -1
        for ii in xrange(len(assignments)):
            if assignments[ii] != last_assignment:
                assignments_seen += 1
            last_assignment = assignments[ii]
            assignments[ii] = assignments_seen
        print assignments

        return assignments


    def fallback_segmenter(self, text, max_sentence_length = 500,
                           arbitrary_words_per_sent = 30,
                           max_sentences_per_texttile = 15,
                           arbitrary_sentences_per_tile = 6):
        # First, try to segment into sentences with punkt
        sentences = punkt.tokenize(text)

        # If that doesn't work, use a really stupid regexp
        longest_sentence = max(len(x) for x in sentences)
        print ("Longest sentence is %i" % longest_sentence)
        if longest_sentence > 500:
            print "Using regexp sentence breaker"
            sentences = punct_regexp.findall(text)

        # If that still doesn't work, use arbitrary breaks
        if max(len(x) for x in sentences) > 600:
            print "Using ad hoc sentence breaker"
            sentences = []
            words = text.split()
            num_words = len(words)
            for ii in xrange(num_words // arbitrary_words_per_sent + 1):
                sentences.append(" ".join(words[ii * arbitrary_words_per_sent:
                                                min((ii + 1) * arbitrary_words_per_sent,
                                                    num_words)]))

        # Now feed that into texttile
        print(sentences)
        try:
            tile_assignments = self.sentence_array_texttile(sentences)
            tiles = set(tile_assignments)
        except ValueError:
            tile_assignments = None

        # If that doesn't work, split "sentences", however defined, into reasonable
        # sized chunks
        if tile_assignments == None or max(sum(1 for y in tile_assignments if y == x) for x in tiles) > max_sentences_per_texttile:
            tile_assignments = [x // arbitrary_sentences_per_tile for x in xrange(len(sentences))]

        return sentences, tile_assignments

    def fallback_wrapper(self, text):
        sentences, assignments = self.fallback_segmenter(text)

        num_sents = len(sentences)
        tiles = []
        for ii in xrange(max(assignments) + 1):
            tiles.append(" ".join(sentences[x] for x in xrange(num_sents) \
                                    if assignments[x] == ii))
        return tiles
Example No. 19
def get_paragraphs_from_text(text):
    tiling_tokenizer = TextTilingTokenizer()
    paragraphs = tiling_tokenizer.tokenize(text)
    return paragraphs
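
Callers of a thin wrapper like get_paragraphs_from_text usually guard against the failure modes seen in Examples No. 13 and No. 21 (text too short, or no blank-line paragraph breaks). A hedged sketch of one way to fall back to a single paragraph:

def get_paragraphs_or_whole_text(text):
    # fall back to treating the whole text as one paragraph when
    # TextTilingTokenizer cannot segment the input
    try:
        return get_paragraphs_from_text(text)
    except ValueError:
        return [text]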
Example No. 20
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tag import pos_tag, pos_tag_sents
from nltk import word_tokenize
from nltk.corpus import stopwords  # needed for stopwords.words('english') below
import codecs
from argparse import ArgumentParser
import os

argparser = ArgumentParser()
argparser.add_argument('file', help="text document")
args = argparser.parse_args()

stopwords = stopwords.words('english')

doc_path = os.path.splitext(args.file)[0]

tt = TextTilingTokenizer()
text = codecs.open(doc_path + '.txt', 'r', "utf-8").read()
parags = tt.tokenize(text)

buffer_tiled = ''
buffer_tiled_tagged = ''
buffer_tiled_tagged_clean = ''

tagged_parags = pos_tag_sents([word_tokenize(p) for p in parags])
clean_parags = [
    filter(lambda taggedword: taggedword[0] not in stopwords, p)
    for p in tagged_parags
]

for i, p in enumerate(parags):
    buffer_tiled += p
Example No. 21
    # db init: ssh [email protected] -i ~/.ssh/id_rsa -L 1234:localhost:3306
    conn = MySQLdb.connect(host="127.0.0.1",
                           user="******",
                           port=3306,
                           passwd="05012014",
                           db="bnc")
    cur = conn.cursor()

    # all convIDs
    sql = 'SELECT DISTINCT(convID) FROM entropy_DEM100'
    cur.execute(sql)
    convIDs = [tup[0] for tup in cur.fetchall()]
    convIDs.sort()

    # tokenizer
    tt = TextTilingTokenizer()

    # get the text of each convID, and do the TextTiling
    failed_convIDs = []
    for cid in convIDs:
        sql = 'SELECT rawWord FROM entropy_DEM100 WHERE convID = %s'
        cur.execute(sql, [cid])
        text = '\n\n\n\t'.join([tup[0] for tup in cur.fetchall()])

        try:
            segmented_text = tt.tokenize(text)
        except Exception, e:
            print 'convID %d failed' % cid
            failed_convIDs.append(cid)
        else:
            global_idx = 1
Example No. 22
                    user = "******", 
                    port = 1234,
                    passwd = "05012014",
                    db = "swbd")
    cur = conn.cursor()

    # create the table
    sql = 'DROP TABLE IF EXISTS textTiling'
    cur.execute(sql)
    sql = 'CREATE TABLE textTiling (convID INT, globalID INT, tileID INT, inTileID INT, entropy FLOAT, \
        PRIMARY KEY (convID, globalID));'
    cur.execute(sql)


    # initialize
    tt = TextTilingTokenizer()
    # tt_demo = TextTilingTokenizer(demo_mode = True)

    # get all conversation IDs
    sql = 'SELECT DISTINCT(convID) FROM entropy'
    cur.execute(sql)
    convIDs = [tup[0] for tup in cur.fetchall()]

    # get text for each cid and do text tiling
    for cid in convIDs:
        sql = 'SELECT tagged FROM entropy WHERE convID = %d' % cid
        cur.execute(sql)
        text = '\n\n\n\t'.join([tup[0] for tup in cur.fetchall()])
        # tiling
        try:
            segmented_text = tt.tokenize(text)
Example No. 23
class TextHandler(object):
    '''
    Parse Out Individual Pieces of Text. 
    Tools utilizable from this API class include:
      -Text Tiling
      -RBF Networks
      -Fuzzy Clustering with Cosine Distances (custom and open source implementation of mine)
      -MNB recognition
      -sent tokenizing
      -minimum distance matching
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.__mnbClassifier=None
        self.__rbfClassifier=None
        self.__textTiler=None
        self.__vectorizer=None
        self.__tfidf=None
        self.__mnbTrained=False
        self.__clustMatrix=None
        self.__preds=[]
    
    
    def delMNB(self):
        '''
        GC Object
        '''
        self.__mnbClassifier=None
        self.__mnb=False
        gc.collect()
        del gc.garbage[:]
    
    def delRBF(self):
        '''
        GC Object
        '''
        self.__rbfClassifier=None
        gc.collect()
        del gc.garbage[:]
    

    
    def delTextTiler(self):
        '''
        GC Object
        '''
        self.__textTiler=None
        gc.collect()
        del gc.garbage[:]
    
    def delVectorizer(self):
        '''
        GC Object
        '''
        self.__vectorizer=None
        gc.collect()
        del gc.garbage[:]
    
    def delTfIdf(self):
        '''
        GC Object
        '''
        self.__tfidf=None
        gc.collect()
        del gc.garbage[:]
    
    def delClustMatrix(self):
        '''
        GC Object
        '''
        self.__clustMatrix=None
        gc.collect()
        del gc.garbage[:]
    
    
    def resetPreds(self):
        '''
        GC Object
        '''
        self.__preds=[]
        gc.collect()
        del gc.garbage[:]
    
    def getPreds(self):
        '''
        Return a list of the current prediction names.
        '''
        return self.__preds
    
    def getClustMatrix(self):
        '''
        Returns the cluster matrix
        '''
        return self.__clustMatrix
    
    
    def getNamedEntities(self):
        '''
        Parse out named entities
        '''
        pass
    
    def buildVectorizer(self,vector):
        '''
        Instantiates the vectorizers such as count vectorizer
        or tfidf vectorizer. This is useful when calling the vectorizer
        multiple times.
        
        *Required Parameters*
        :param vector: which vectorizer to build ('count','tfidf')
        
        '''
        if vector == 'count':
            self.__vectorizer=CountVectorizer(stop_words='english')
        elif vector == 'tfidf':
            self.__tfidf=TfidfTransformer(norm='l2')
    
    def trainVectorizers(self,document):
        '''
        Train the Vectorizers with a document that should be tokenized into sentences and words
        
        **Warning: All listed items will be concatenated to a single matrix**
        
        *Required Parameters*
        :param document: the document (text) or list of documents (file paths) to build count and tfidf vectorizers with (be as representative as possible)
        '''
        
        self.buildVectorizer('count')
        self.buildVectorizer('tfidf')
        
        if type(document) is str:
            self.__tfidf.fit(self.__vectorizer.fit_transform(document))
        else:
            uvecs=None
            sentences=[]
            for doc in document:
                if os.path.exists(doc) is True:
                    sentences=[]
                    with open(doc, 'r') as fp:  # open each listed document path, not the list itself
                        sentences.extend([parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(fp.read())])
        
            if uvecs is not None:
                self.__tfidf.fit(self.__vectorizer.fit(sentences))
                    
    def buildClassifier(self,classifier):
        '''
        Instantiates each of the Classifiers. Vectorizers
        should be built separately
        
        
        *Required Parameters*
        :param classifier: specify which type of classifier to build (mnb,rbf,sent,textTile)
        '''
        
        if classifier == 'mnb':
            self.__mnbClassifier=MultinomialNB()
        elif classifier == 'rbf':
            self.__rbfClassifier=SVC()
        elif classifier=='sent':
            self.__sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
        elif classifier == 'textTile':
            self.__textTiler=TextTilingTokenizer()
        elif classifier=='clust':
            self.__clustMatrix=None
    
    def trainMinDistanceMat(self,cats):
        '''
        Train the Minmum Distance Cluster Matrix from given files
        **WARNING: This will turn a sparse vector to a dense vector**
        
        *Required Parameters*
        :param cats: categories to use
        '''
        for cat in cats:
            if os.path.exists(cat) is True:
                sentences=[]
                with open(cat,'r') as fp:
                    sentences=[parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(fp.read())]
                
                if len(sentences)>0:
                    vecs=self.__vectorizer.transform(sentences)
                    vecs=self.__tfidf.transform(vecs)
                    
                    if self.__clustMatrix is None:
                        self.__clustMatrix=vecs.mean()
                    else:
                        self.__clustMatrix=scipy.sparse.vstack((self.__clustMatrix,vecs.mean()))
    
    def getMinDistanceCategory(self,document):
        '''
        Find the best strength document via the trained cats to the document.
        
        Requires building the clustMatrix and vectorizers.
        
        **WARNING: This will turn a sparse vector to a dense vector**
        
        *Required Parameters*
        :param document: document to test with
        '''
        vecs=self.__tfidf.transform(self.__)
        
    
    def trainMNB(self,cats,partial=False):
        '''
        Train Multinomial Bayes to use in obtaining the appropriate weights for data. Please instantiate the sent_tokenizer
        
        *Required Parameters*
        :param cats: list of category directories
        
        *Optional Parameters*
        :param partial: whether to create a partial fit from the data (if using partial, please train the vectorizers first)
        ''' 
        self.__preds=[]
        gc.collect()
        del gc.garbage[:]
        
        if self.__vectorizer is None:
            self.buildVectorizer('count')
        if self.__tfidf is None:
            self.buildVectorizer('tfidf')  # 'tfidf' is handled by buildVectorizer, not buildClassifier
        
        cl=[]

        uvecs=None
        for cat in cats:
            if os.path.exists(cat) is True:
                sentences=[]
                with open(cat,'r') as fp:
                    sentences=[parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(fp.read())]
                
                if len(sentences)>0:
                    vecs=self.__vectorizer.transform(sentences)
                    vecs=self.__tfidf.transform(vecs)
                    
                    if partial is True:
                        self.__preds.append(re.sub("\..*|\/","",os.path.split(cat)[1]).strip())
                        
                        for i in range(vecs.shape[0]):
                            cl.append(len(self.__preds)-1)
                        self.__mnbClassifier.partial_fit(vecs,numpy.asarray(cl))
                        cl=[]
                    else:
                        self.__preds.append(re.sub("\..*|\/","",os.path.split(cat)[1]).strip())
                        for i in range(vecs.shape[0]):
                            cl.append(len(self.__preds)-1)
                        
                        if uvecs is None:
                            uvecs=vecs
                        else:
                            uvecs=scipy.sparse.vstack((uvecs,vecs))
        
        if partial is False and uvecs is not None:
            self.__mnbClassifier.fit(uvecs, cl)
        
        del uvecs
        del cl
        gc.collect()
        del gc.garbage[:]
                        
    def classifyMNB(self,document):
        '''
        Multinomial Bayes Algorithm for fastest but least reliable results. Use only if the topics are clearly distinguishable.
        Requires building vectorizers and training MNB first. Returns the name and number of the category to work with
        
        *Required Parameters*
        :param document: document to classify
        '''
        sentences=numpy.asarray([parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(document)])
        vecs=self.__tfidf.transform(self.__vectorizer.transform(sentences))
        return [(self.__preds[x],x) for x in self.__mnbClassifier.predict(vecs)]
    
    def trainRBF(self,cats):
        '''
        Trains an RBF classifier for use in categorization.
        There is no Partial fit for a neural network. Everything must fit in memory.
        
        *Required Parameters*
        :param cats: list of category files to train on
        '''
        
        cl=[]
        uvecs=None
        self.__preds=[]
        gc.collect()
        del gc.garbage[:]
        for cat in cats:
            if os.path.exists(cat) is True:
                sentences=[]
                with open(cat,'r') as fp:
                    sentences=numpy.asarray([parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(fp.read())])
                
                if len(sentences)>0:
                    vecs=self.__vectorizer.transform(sentences)
                    vecs=self.__tfidf.transform(vecs)
                    
                    
                    if uvecs is None:
                        uvecs=vecs
                    else:
                        uvecs=scipy.sparse.vstack((uvecs,vecs))
                    self.__preds.append(re.sub("\..*|\/","",os.path.split(cat)[1]).strip())
                    for i in range(vecs.shape[0]):
                        cl.append(len(self.__preds)-1)
        
        if uvecs is not None:
            self.__rbfClassifier.fit(uvecs, cl)
        
        del cl
        del uvecs
        gc.collect()
        del gc.garbage[:]
    
    def classifyRBF(self,document):
        '''
        Classify with RBF Neural Network from SK Learn.
        
        Requires training count and self.__sent_tokenizer.tokenize,tfidf and count vectorizers, and the RBF classifier first
        
        *Required Parameter*
        :param document: text document to use
        '''
        sentences=numpy.asarray([parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(document)])
        vecs=self.__tfidf.transform(self.__vectorizer.transform(sentences))
        return [(self.__preds[x],x) for x in self.__rbfClassifier.predict(vecs)]
    
    def sentTokenize(self,document,parse=False,remPunc=True):
        '''
        Used to tile on sentences using the Brown corpus from ntlk.
        No vectorizers are necessary but the sent_tokenizer needs to 
        be established.
        
        *Required Parameters*
        :param document: full text of document
        
        *Optional Parameters*
        :param parse: whether to use CLIPS pattern to stem and disambiguate the sentence
        :param remPunc: whether to remove punctuation (default is true) [certain algos. such as max ent for sentence detection may require False]
        '''
        print self.__sent_tokenizer.tokenize(document)
        sentences=[(lambda x:parse(x,tags=False,chunks=False).split(" ") if parse is True else x)(x) for x in self.__sent_tokenizer.tokenize(document)]
        return sentences
    
    def textTiler(self,document,parse=False):
        '''
        Tile Text for further Processing. Separation by topic is recommended before identifying what that topic is.
        Even better results can be obtained with SimplrTerms feature folder but a tool like that can take a while.
        
        *Required Parameters*
        :param document: The Document to Tile
        
        *Optional Parameters*
        :param parse: whether to stem and disambiguate sentences in the document using pattern clips
        '''
        if parse is True:
            document="\n".join(self.sentTokenize(document, parse,remPunc=False))
        return self.__textTiler.tokenize(document)
Example No. 25
from nltk.tokenize.texttiling import TextTilingTokenizer  # needed for TextTilingTokenizer() below
from nltk.corpus import stopwords  # needed for stopwords.words("english") below
from nltk.tag import pos_tag, pos_tag_sents
from nltk import word_tokenize
import codecs
from argparse import ArgumentParser
import os

argparser = ArgumentParser()
argparser.add_argument("file", help="text document")
args = argparser.parse_args()


stopwords = stopwords.words("english")

doc_path = os.path.splitext(args.file)[0]

tt = TextTilingTokenizer()
text = codecs.open(doc_path + ".txt", "r", "utf-8").read()
parags = tt.tokenize(text)


buffer_tiled = ""
buffer_tiled_tagged = ""
buffer_tiled_tagged_clean = ""

tagged_parags = pos_tag_sents([word_tokenize(p) for p in parags])
clean_parags = [filter(lambda taggedword: taggedword[0] not in stopwords, p) for p in tagged_parags]

for i, p in enumerate(parags):
    buffer_tiled += p

    for word, tag in tagged_parags[i]: