Example no. 1
0
def get_corpus(name):
    """Read a corpus file and parse each non-blank line into a document.

    Parameters
    ----------
    name : str
        Path to the corpus file, one document per line.

    Returns
    -------
    list
        One entry per non-blank line, as produced by ``parse_document``.
    """
    corpus = []
    # The ``with`` statement closes the file automatically on exit, so
    # the explicit corpus_file.close() the original carried is redundant.
    with open(name, 'r') as corpus_file:
        for doc in corpus_file:
            if doc.strip():  # skip blank/whitespace-only lines
                corpus.append(parse_document(doc))
    return corpus
            for status, chunk in itertools.groupby(
                flattened_chunks, lambda (word, pos, chunk): chunk != 'O')
        ]

        valid_chunks = [
            ' '.join(word.lower() for word, tag, chunk in wtc_group
                     if word.lower() not in stopword_list)
            for status, wtc_group in valid_chunks_tagged if status
        ]

        all_chunks.append(valid_chunks)

    return all_chunks


# Extract valid noun-phrase chunks from the toy text and display them.
# NOTE: the original used the Python 2 statement form ``print valid_chunks``,
# which is a syntax error under Python 3; the rest of the file already uses
# the ``print()`` function, so normalize to that.
sentences = parse_document(toy_text)
valid_chunks = get_chunks(sentences)
print(valid_chunks)


def get_tfidf_weighted_keyphrases(sentences,
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):

    valid_chunks = get_chunks(sentences, grammar=grammar)

    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
to control their body temperature. Their pillar-like legs can 
carry their great weight. African elephants have larger ears 
and concave backs while Asian elephants have smaller ears 
and convex or level backs.  
"""


from gensim.summarization import summarize, keywords

def text_summarization_gensim(text, summary_ratio=0.5):
    """Summarize *text* with gensim and print each summary sentence.

    ``summary_ratio`` is the fraction of the original text to retain.
    """
    for sentence in summarize(text, split=True, ratio=summary_ratio):
        print(sentence)

# Summarize the toy text: rejoin the parsed sentences into one string,
# then print a gensim summary keeping ~40% of the original.
docs = parse_document(toy_text)
text = ' '.join(docs)
text_summarization_gensim(text, summary_ratio=0.4)


    
# Parse and normalize the toy document (no lemmatization) ahead of the
# matrix-based summarization steps below.
sentences = parse_document(toy_text)
norm_sentences = normalize_corpus(sentences,lemmatize=False) 

total_sentences = len(norm_sentences)
print('Total Sentences in Document:', total_sentences)

# Summarizer hyper-parameters: sentences to keep and latent topics.
num_sentences = 3
num_topics = 2

vec, dt_matrix = build_feature_matrix(sentences, 
                                             lambda (word,pos,chunk): chunk != 'O')]
        
        valid_chunks = [' '.join(word.lower() 
                                for word, tag, chunk 
                                in wtc_group 
                                    if word.lower() 
                                        not in stopword_list) 
                                    for status, wtc_group 
                                    in valid_chunks_tagged
                                        if status]
                                            
        all_chunks.append(valid_chunks)
    
    return all_chunks
    
# Extract valid noun-phrase chunks from the toy text and display them.
# The original ``print valid_chunks`` is Python 2 statement syntax and a
# SyntaxError under Python 3; use the print() function as elsewhere here.
sentences = parse_document(toy_text)
valid_chunks = get_chunks(sentences)
print(valid_chunks)

def get_tfidf_weighted_keyphrases(sentences, 
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):
    
    valid_chunks = get_chunks(sentences, grammar=grammar)
                                     
    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]
    
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
for moving objects and digging. Elephants' large ear flaps help 
to control their body temperature. Their pillar-like legs can 
carry their great weight. African elephants have larger ears 
and concave backs while Asian elephants have smaller ears 
and convex or level backs.  
"""


def text_summarization_gensim(text, summary_ratio=0.5):
    """Print each sentence of a gensim summary of *text*.

    ``summary_ratio`` controls what fraction of the input is kept.
    """
    summary_sentences = summarize(text, split=True, ratio=summary_ratio)
    for line in summary_sentences:
        print(line)


# Using Gensim Summarization Method
docs = parse_document(document1)
text = ' '.join(docs)
text_summarization_gensim(text, summary_ratio=0.3)

# Prepare normalized sentences for matrix-based (LSA-style) summarization.
sentences = parse_document(document1)
norm_sentences = normalize_corpus(sentences, lemmatize=False)

total_sentences = len(norm_sentences)
print('Total Sentences in Document:', total_sentences)

# Summarizer hyper-parameters: sentences to keep and latent topics.
num_sentences = 3
num_topics = 1

# Build a document-term frequency matrix, then transpose it to
# term-document orientation for the decomposition step.
vec, dt_matrix = build_feature_matrix(sentences, feature_type='frequency')

td_matrix = dt_matrix.transpose()
Example no. 6
0
    top_sentence_indices.sort()
    s = ''
    for index in top_sentence_indices:
        s = s + ' ' + sentences[index]
        print(sentences[index])
    return s


# Batch-summarize every Opinosis topic file and write results for each
# summarizer into its own processed-data subdirectory.
path = r'../../data/raw/OpinosisDataset1.0_0/topics/'
allFiles = glob.glob(path + "/*.data")
reviews = list()
for file_ in allFiles:
    with open(file_, "r") as f:
        review = f.read()
        DOCUMENT = review
        sentences = parse_document(DOCUMENT)
        norm_sentences = normalize_corpus(sentences, lemmatize=True)
        print("Total Sentences:", len(norm_sentences))
        # Strip the directory part to recover the bare file name; the
        # character class handles both Windows and POSIX separators.
        filename_search = re.search(r'[^\\/:*?"<>|\r\n]+$', file_)
        filename = filename_search.group()
        # NOTE(review): these output handles are never explicitly closed
        # or flushed — a ``with open(...)`` block would be safer here.
        myfile = open(r'../../data/processed/lsa/' + filename, 'w')
        myfile.writelines(
            lsa_text_summarizer(norm_sentences,
                                num_sentences=2,
                                num_topics=5,
                                feature_type='frequency',
                                sv_threshold=0.5))
        # Second pass: TextRank (cosine similarity) summarizer output.
        myfile = open(r'../../data/processed/textrank_cosine/' + filename, 'w')
        myfile.writelines(
            textrank_text_summarizer(norm_sentences,
                                     num_sentences=2,
to control their body temperature. Their pillar-like legs can 
carry their great weight. African elephants have larger ears 
and concave backs while Asian elephants have smaller ears 
and convex or level backs.  
"""


from gensim.summarization import summarize, keywords

def text_summarization_gensim(text, summary_ratio=0.5):
    """Summarize *text* with gensim and print each summary sentence.

    Parameters
    ----------
    text : str
        The document to summarize.
    summary_ratio : float, optional
        Fraction of the original text to keep (default 0.5).
    """
    summary = summarize(text, split=True, ratio=summary_ratio)
    for sentence in summary:
        # Python 2 ``print sentence`` replaced with the print() function,
        # matching the other copies of this helper in the file.
        print(sentence)

# Summarize the toy text: rejoin the parsed sentences into one string,
# then print a gensim summary keeping ~40% of the original.
docs = parse_document(toy_text)
text = ' '.join(docs)
text_summarization_gensim(text, summary_ratio=0.4)


    
# Parse and normalize the toy text, then report the sentence count.
# The original used the Python 2 print statement (a SyntaxError under
# Python 3); normalized to the print() function used elsewhere in the file.
sentences = parse_document(toy_text)
norm_sentences = normalize_corpus(sentences, lemmatize=False)

total_sentences = len(norm_sentences)
print('Total Sentences in Document:', total_sentences)



# Summarizer hyper-parameters: sentences to keep and latent topics.
num_sentences = 3
num_topics = 2