Example #1
def store_sentence_lengths(dataset):
    """Count sentence lengths (in tokens) across a dataset and pickle the list."""
    path = '../data/'+dataset
    docs, labels = read_files(path)
    lengths = []
    for doc in docs:
        sentences = preprocess.tokenize_sentences(doc)
        for s in sentences:
            tokens = preprocess.tokenize_tokens(s)
            lengths.append(len(tokens))
    print(lengths)
    pickle_to_file(lengths, 'output/sentence-lengths/'+dataset)
Example #2
def _cooccurrence_preprocessing(doc, context, already_preprocessed):
    """Preprocess document as needed for co-occurrence network creation"""
    if context == 'window':
        if already_preprocessed:
            doc = doc.split(' ')
        else:
            doc = preprocess.preprocess_text(doc)
    elif context == 'sentence':
        doc = preprocess.tokenize_sentences(doc)
        for i, sentence in enumerate(doc):
            sentence = preprocess.preprocess_text(sentence)
            doc[i] = sentence
    return doc
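A minimal usage sketch, assuming the project's preprocess module is importable; the sample strings are purely illustrative:

# Hypothetical calls -- the input text is a stand-in, not from the original project.
window_tokens = _cooccurrence_preprocessing(
    'graphs model word cooccurrence', context='window', already_preprocessed=True)
sentence_lists = _cooccurrence_preprocessing(
    'First sentence. Second sentence.', context='sentence', already_preprocessed=False)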
Example #4
def plot_sentence_lengths(datafile=None):
    """
    Function for plotting histogram of sentence lengths within a given dataset.
    """
    if datafile is None:
        import preprocess
        print('> reading data..')
        path = '../data/tasa/TASA900_text'
        texts, labels = data.read_files(path)
        sentence_lengths = []
        print('> counting lengths..')
        for text in texts:
            sentences = preprocess.tokenize_sentences(text)
            for sentence in sentences:
                tokens = preprocess.tokenize_tokens(sentence)
                sentence_lengths.append(len(tokens))
        data.pickle_to_file(sentence_lengths, 'output/tasa_sentence_lengths.pkl')
    else:
        sentence_lengths = data.pickle_from_file(datafile)
    plotter.histogram(sentence_lengths, 'sentence length (tokens)', '# sentences', bins=70)
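Both call modes, as a usage sketch grounded in the function itself (the paths are the ones hard-coded above):

plot_sentence_lengths()                                             # recount from ../data/tasa/TASA900_text and pickle the result
plot_sentence_lengths(datafile='output/tasa_sentence_lengths.pkl')  # reuse the pickled lengths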
Example #6
def summarize(article):
    """Rank sentences with GloVe embeddings and PageRank (TextRank-style), then
    blank out one named entity per ranked sentence to build fill-in-the-blank
    questions."""
    # Fall back to an unverified HTTPS context (commonly needed for nltk.download on macOS).
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    #nltk.download('averaged_perceptron_tagger')
    sentences = preprocess.tokenize_sentences(article)
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
    clean_sentences = [s.lower() for s in clean_sentences]
    clean_sentences = [
        preprocess.remove_stopwords(r.split()) for r in clean_sentences
    ]

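    # Build a word -> 100-d GloVe vector lookup (the file path is hard-coded in the original example).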
    word_embeddings = {}
    f = open('/Users/apple/Downloads/glove.6B/glove.6B.100d.txt',
             encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs
    f.close()

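    # A sentence vector is the average of its words' GloVe vectors; the +0.001 guards against a zero-length split.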
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum(
                [word_embeddings.get(w, np.zeros((100, )))
                 for w in i.split()]) / (len(i.split()) + 0.001)
        else:
            v = np.zeros((100, ))
        sentence_vectors.append(v)

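    # Pairwise cosine similarity between sentence vectors.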
    sim_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(
                    sentence_vectors[i].reshape(1, 100),
                    sentence_vectors[j].reshape(1, 100))[0, 0]

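    # Score sentences with PageRank over the similarity graph, then sort best-first.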
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)

    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # r = Rake()

    ques = []

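    # For each ranked sentence, blank out the first named entity spaCy finds to build a fill-in-the-blank question.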
    for i in range(len(ranked_sentences)):
        tokens = []
        print(ranked_sentences[i][1])
        article = ranked_sentences[i][1]
        print("Article:", article)
        # r.extract_keywords_from_text(ranked_sentences[i][1])
        # print("*********************")
        # print(r.get_ranked_phrases()) # To get keyword phrases ranked highest to lowest.
        # tokens.extend(r.get_ranked_phrases())
        # lis = []
        # for i in range(len(tokens)):
        #     if len(tokens[i].split()) > 1:
        #         lis.extend(nltk.word_tokenize(tokens[i]))
        #
        #     else:
        #         lis.append(tokens[i])
        # print("Parts of speech tagging: ", pos_tag(lis))
        # for i in range(len(ranked_sentences)):
        doc = nlp(article)
        print("DOC", doc.ents)
        print([(X.text, X.label_) for X in doc.ents])
        for X in doc.ents:
            if X.label_:
                print("Inside for")
                article = article.replace(X.text, "__________")
                ques.append(article)
                break
                #print(ques)
                #print(type(ques))
        print(i + 1, ":", article)

    print(ques)
    return ques
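A sketch of the module-level setup summarize() appears to rely on; the import names are inferred from the function body, and the spaCy model name is an assumption:

import ssl
import numpy as np
import pandas as pd
import networkx as nx
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import preprocess  # the project's own preprocessing module

nlp = spacy.load('en_core_web_sm')  # assumption: any English pipeline with NER would do

# summarize() also reads GloVe vectors from the path hard-coded inside the function.
questions = summarize("Ada Lovelace published the first algorithm intended for a machine. "
                      "She collaborated with Charles Babbage on the Analytical Engine.")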
Example #7
import argparse