Example #1
def run():
    for idx, filename in enumerate(os.listdir(os.getcwd() + '/papers')):
        paper_path = os.getcwd() + '/papers/' + filename

        # Normalise the extracted PDF text: lowercase it and collapse the
        # extra spaces that PDF extraction leaves after sentence breaks.
        content = (
            convert_pdf_to_txt(paper_path)
            .lower()
            .replace('.   ', '. ')
            .replace('.  ', '. ')
        )
        # Replacements tried but currently disabled:
        # .replace('\n\n', '\n')
        # .replace('\f', '')
        # .replace('\n', ' ')
        # .replace('-', ' ')

        relevant_text = get_intro_conclusion(content)
        raw_docs[idx] = relevant_text

        relevant_text = relevant_text\
            .replace('.   ', '. ')\
            .replace('.  ', '. ')\
            .replace('- ', '')

        # Utilising NLTK Text Tiling with default params
        # seg_2 = TextTilingTokenizer().tokenize(relevant_text)

        # Utilising NLTK Text Tiling with custom params(pseudosentence size, block comparison size)
        tt = TextTilingTokenizer(w=10, k=4)
        paper_tiles = tt.tokenize(relevant_text)

        text_tiles[idx] = paper_tiles

    return raw_docs, text_tiles
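
Example #1 leans on module-level state (raw_docs, text_tiles) and on PDF helpers (convert_pdf_to_txt, get_intro_conclusion) defined elsewhere in that project. As a hedged, self-contained illustration of the same custom-parameter call, the sketch below runs TextTiling on a slice of NLTK's Brown corpus, whose raw files already contain the blank-line paragraph breaks the tokenizer expects:

from nltk.corpus import brown
from nltk.tokenize import TextTilingTokenizer

# Minimal sketch: w and k mirror the pseudosentence and block-comparison
# sizes used in run() above; any long, blank-line separated text works.
tt = TextTilingTokenizer(w=10, k=4)
tiles = tt.tokenize(brown.raw()[:10000])
print(len(tiles), 'topical segments')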
Example #2
def vis_tokenize(context, question):

    glove = utils.load_glove(dim=200)

    ttt = TextTilingTokenizer()

    para_list = []
    paras = [para for para in context.split('\\n') if para != '']
    for para in paras:
        sent_list = []
        for sent in sent_tokenize(para):
            temp = {}
            temp['words'] = word_tokenize(sent)
            temp['vectors'] = [
                np.array(glove[word.lower()]) for word in temp['words']
            ]
            sent_list.append(temp)
        para_list.append(sent_list)

    q_dict = {}
    q_dict['words'] = word_tokenize(question)
    q_dict['vectors'] = [
        np.array(glove[word.lower()]) for word in q_dict['words']
    ]
    return para_list, q_dict
Example #3
def texttiling():
    conn = db_conn('map')
    cur = conn.cursor()
    tt = TextTilingTokenizer()
    # select all unique observation
    sql = 'SELECT DISTINCT(observation) FROM utterances'
    cur.execute(sql)
    unique_observs = [t[0] for t in cur.fetchall()]
    # for each obsv
    for i, obsv in enumerate(unique_observs):
        sql = 'SELECT utterID, tagged FROM utterances WHERE observation = %s AND tagged <> ""'
        cur.execute(sql, [obsv])
        utter_id, tagged = zip(*cur.fetchall())
        text = '\n\n\n\t'.join(tagged)
        try:
            segmented_text = tt.tokenize(text)
        except Exception as e:
            raise e
        else:
            uid_idx = 0
            for j, seg in enumerate(segmented_text):
                topic_id = j + 1
                sents = [s for s in seg.split('\n\n\n\t') if s != '']
                for k, s in enumerate(sents):
                    in_topic_id = k + 1
                    sql = 'UPDATE utterances SET topicID = %s, inTopicID = %s \
                        WHERE observation = %s AND utterID = %s'

                    cur.execute(
                        sql, (topic_id, in_topic_id, obsv, utter_id[uid_idx]))
                    uid_idx += 1
                    conn.commit()
            sys.stdout.write('\r{}/{}'.format(i + 1, len(unique_observs)))
            sys.stdout.flush()
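
The '\n\n\n\t' separator in Example #3 does double duty: the blank line gives TextTiling the paragraph breaks it snaps boundaries to, and the full sentinel lets each returned segment be split back into whole utterances so topicID and inTopicID can be assigned in order. A hedged sketch of that round-trip without the database (Brown-corpus sentences stand in for the real utterances):

from nltk.corpus import brown
from nltk.tokenize import TextTilingTokenizer

# Stand-in utterances; in the original these come from the utterances table.
utterances = [' '.join(sent) for sent in brown.sents()[:200]]
text = '\n\n\n\t'.join(utterances)

tt = TextTilingTokenizer()
uid_idx = 0
for topic_id, seg in enumerate(tt.tokenize(text), start=1):
    for in_topic_id, s in enumerate([u for u in seg.split('\n\n\n\t') if u != ''], start=1):
        # The original issues its UPDATE here with
        # (topic_id, in_topic_id, obsv, utter_id[uid_idx]).
        uid_idx += 1
print(uid_idx, 'utterances assigned to topics')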
Example #4
def texttiling_text(text, k=20, w=40, smoothing_width=10, smoothing_rounds=5):
    tt = TextTilingTokenizer(stopwords=raw_stopword_list,
                             k=k,
                             w=w,
                             smoothing_width=smoothing_width,
                             smoothing_rounds=smoothing_rounds)

    o = tt.tokenize(text)
    return o
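
raw_stopword_list is defined elsewhere in that module. A hedged usage sketch of the same call, with NLTK's English stopword list standing in for it and a slice of the Brown corpus as input:

from nltk.corpus import brown, stopwords
from nltk.tokenize import TextTilingTokenizer

# Same tuning knobs as texttiling_text()'s defaults; only the stopword
# list is swapped for NLTK's built-in English list.
tt = TextTilingTokenizer(stopwords=stopwords.words('english'),
                         k=20, w=40, smoothing_width=10, smoothing_rounds=5)
tiles = tt.tokenize(brown.raw()[:10000])
print(len(tiles), 'segments')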
Example #5
 def reload_tiler(self):
     """
     Reload the text tiler. Use if memory is an issue.
     """
     del self.__tiler
     self.__tiler = TextTilingTokenizer(
         stopwords=self.__stop_words,
         cutoff_policy=self.__cutoff_policy,
         w=self.__w,
         k=self.__k)
Example #6
def split_pp_to_paragraphs(clean_pp, contractions_dict, pattern):
    """
    Uses TextTilingTokenizer to split to paragraphs, the
    privacy policy document should be pre-processed (HTML cleaned) before reaching this function.
    :param clean_pp: clean pp before expansion of contractions and special cases
    :param contractions_dict: a dictionary that includes all varieties of contractions and their expansion
    :param pattern: pattern for the expansion of contractions
    :return: list of paragraphs
    """
    clean_pp = clean_pp_advanced(clean_pp, contractions_dict, pattern)
    ttt = TextTilingTokenizer()
    paragraphs = ttt.tokenize(clean_pp)
    return paragraphs
Example #7
def segments(txt):

    ttt = TextTilingTokenizer()
    tokens = ttt.tokenize(txt)

    start = 0
    end = 0
    tileSpan = []

    for token in tokens:
        end = start + len(token)
        tileSpan.append((start, end))
        start = end
    return tileSpan
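
In the NLTK implementation, tokenize returns contiguous slices of its input, so the (start, end) pairs built by segments() index straight back into the original string. A short usage sketch, reusing segments() from this example (with TextTilingTokenizer imported as in the other snippets):

from nltk.corpus import brown

txt = brown.raw()[:10000]
for start, end in segments(txt):
    tile = txt[start:end]            # the topical segment itself
    print(start, end, repr(tile[:50]))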
Example #8
def demo(text=None):
    from nltk.corpus import brown
    from matplotlib import pylab
    tt = TextTilingTokenizer(demo_mode=True)
    if text is None:
        text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
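
With demo_mode=True the tokenizer returns its intermediate series instead of segments, which is what the plot above shows: raw gap scores, smoothed gap scores, depth scores, and a 0/1 boundary indicator per sentence gap. A small non-plotting sketch of the same call:

from nltk.corpus import brown
from nltk.tokenize import TextTilingTokenizer

tt = TextTilingTokenizer(demo_mode=True)
gap_scores, smoothed, depth, boundaries = tt.tokenize(brown.raw()[:10000])
print(sum(boundaries), 'boundaries proposed over', len(gap_scores), 'gaps')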
Example #9
 def __init__(self,
              cutoff_policy='HC',
              stop_words=stopwords.words('english'),
              w=20,
              k=10):
     """
     Constructor
     """
     self.__stop_words = stop_words
     self.__cutoff_policy = cutoff_policy
     self.__w = w
     self.__k = k
     self.__tiler = TextTilingTokenizer(stopwords=stop_words,
                                        cutoff_policy=cutoff_policy,
                                        w=w,
                                        k=k)
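
Examples #5 and #9 share the same private attributes and plausibly come from a single wrapper class. A hedged sketch of how they might fit together; the class name TopicTiler and its tokenize method are assumptions, the string 'HC' is replaced by NLTK's HC constant from nltk.tokenize.texttiling, and the stopword default is resolved inside the constructor instead of at definition time:

from nltk.corpus import stopwords
from nltk.tokenize.texttiling import HC, TextTilingTokenizer


class TopicTiler:
    """Hypothetical wrapper combining the constructor from Example #9
    with reload_tiler from Example #5."""

    def __init__(self, cutoff_policy=HC, stop_words=None, w=20, k=10):
        self.__stop_words = stop_words if stop_words is not None else stopwords.words('english')
        self.__cutoff_policy = cutoff_policy
        self.__w = w
        self.__k = k
        self.__tiler = TextTilingTokenizer(
            stopwords=self.__stop_words,
            cutoff_policy=self.__cutoff_policy,
            w=self.__w,
            k=self.__k)

    def reload_tiler(self):
        """Recreate the underlying tokenizer if memory becomes an issue."""
        del self.__tiler
        self.__tiler = TextTilingTokenizer(
            stopwords=self.__stop_words,
            cutoff_policy=self.__cutoff_policy,
            w=self.__w,
            k=self.__k)

    def tokenize(self, text):
        return self.__tiler.tokenize(text)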
Example #10
def segment_transcript(doc):
    """doc is a document object with text lines
    in 'transcript',
    add a list of 'topics' to the document object
    and return it
    """

    tok = TextTilingTokenizer()

    lines = [turn['text'] for turn in doc['lines']]
    text = "\n\n".join(lines)

    doc['topics'] = []
    start = 0
    for topic in tok.tokenize(text):
        length = len(topic.strip().split('\n\n'))
        end = start + length
        doc['topics'].append({'start': start, 'end': end})
        start = end

    return doc
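
A hedged usage sketch showing the document shape segment_transcript expects: a 'lines' list of turns, each carrying a 'text' field. Brown-corpus sentences stand in for real transcript turns, since the tokenizer needs a reasonably long input, and segment_transcript itself is the function from the example above:

from nltk.corpus import brown

doc = {'lines': [{'text': ' '.join(sent)} for sent in brown.sents()[:200]]}
doc = segment_transcript(doc)
print(doc['topics'][:3])   # a list of {'start': ..., 'end': ...} spans over line indices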
Example #11
from nltk.corpus import stopwords
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tag import pos_tag, pos_tag_sents
from nltk import word_tokenize
import codecs
from argparse import ArgumentParser
import os

argparser = ArgumentParser()
argparser.add_argument('file', help="text document")
args = argparser.parse_args()

stopwords = stopwords.words('english')

doc_path = os.path.splitext(args.file)[0]

tt = TextTilingTokenizer()
text = codecs.open(doc_path + '.txt', 'r', "utf-8").read()
parags = tt.tokenize(text)

buffer_tiled = ''
buffer_tiled_tagged = ''
buffer_tiled_tagged_clean = ''

tagged_parags = pos_tag_sents([word_tokenize(p) for p in parags])
# Drop stopwords from each tagged paragraph, keeping (word, tag) pairs.
clean_parags = [
    [tagged_word for tagged_word in p if tagged_word[0] not in stopwords]
    for p in tagged_parags
]

for i, p in enumerate(parags):
    buffer_tiled += p
Example #12
 def __init__(self):
     self._tt = TextTilingTokenizer()
Example #13
def get_paragraphs_from_text(text):
    tiling_tokenizer = TextTilingTokenizer()
    paragraphs = tiling_tokenizer.tokenize(text)
    return paragraphs
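
A closing note that applies to most of the snippets above: the tokenizer snaps its boundaries to blank-line paragraph breaks, which is why Examples #3 and #10 join their utterances and turns with double newlines before tiling. A final hedged usage sketch of get_paragraphs_from_text on blank-line separated text:

from nltk.corpus import brown

# Any sufficiently long text with blank-line paragraph breaks will do;
# a slice of the Brown corpus is used purely for illustration.
paragraphs = get_paragraphs_from_text(brown.raw()[:10000])
for i, p in enumerate(paragraphs, start=1):
    print(i, len(p), repr(p[:60]))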