Code Example #1
File: p1.py  Project: lydakis/Text-Summarization
    curr += 1

tokenizer = Tokenizer()
model = Model()

# data = tokenizer.tokenize_sentence(corpus)
# clusters = model.fit(data)
# summarizer = Summarizer(clusters)
# import pdb; pdb.set_trace()
# print(summarizer.generate())
# module is assumed to map section names to lists of text lines;
# it is defined earlier in the full file, outside this snippet.
for k, v in module.items():
    v = ' '.join(v)                        # flatten the section into one string
    data = tokenizer.tokenize_sentence(v)  # split the section into sentences
    clusters = model.fit(data)             # cluster the sentences
    summarizer = Summarizer(clusters)
    print(summarizer.generate())           # print the summary for this section

    # print(k)
    # if(re.match(ur'ΚΕΦΑΛΑΙΟ [1-9]ο :',text[i])):
    #   if(text[i] not in chapter):
    #       chapter[text[i]] = 1
    #   else:
    #       j = i+1
    #       para = ""
    #       while not(re.match(ur'ΚΕΦΑΛΑΙΟ [1-9]ο :',text[j])):
    #           if(text[j] == u"Βιβλιογραφικές Αναφορές" or text[j] == u"Βιβλιογραφία" or text[j] == u"ΒΙΒΛΙΟΓΡΑΦΙΚΕΣ ΑΝΑΦΟΡΕΣ"):
    #               break
    #           para += text[j]
    #           j = j+1
    #       subtext[text[i]] = para
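
The commented-out lines near the top of this snippet (data = ..., clusters = ..., summarizer = ...) outline the same pipeline applied to a single string instead of per section. A minimal runnable sketch of that path, assuming the imports used in p3.py below and a placeholder corpus string:

from model import Model
from tokenizer import Tokenizer
from summarizer import Summarizer

tokenizer = Tokenizer()
model = Model()

corpus = "Plain text to be summarized."     # placeholder document text
data = tokenizer.tokenize_sentence(corpus)  # sentence-tokenize the raw text
clusters = model.fit(data)                  # cluster the sentences
summarizer = Summarizer(clusters)
print(summarizer.generate())                # print the generated summary
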
Code Example #2
File: p3.py  Project: lydakis/Text-Summarization
from model import Model
from tokenizer import Tokenizer
from summarizer import Summarizer
import codecs
import sys

# Open the input file for reading; the file name is passed on the command line
file_name = sys.argv[1]
# Example input file name: greekpdf-2.txt
with codecs.open(file_name, encoding='utf-8') as f:
    text = f.read().split('\n')

# Group lines into chapters: a line that is a bare number starts a new
# chapter, and every following line belongs to it until the next number.
chapter_dictionary = dict()
for i in range(len(text)):
    if text[i].isdigit():
        chapter_dictionary[text[i]] = [text[i + 1]]
        for j in range(i + 2, len(text)):
            if text[j].isdigit():
                break  # next chapter number reached
            chapter_dictionary[text[i]].append(text[j])
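
# Illustration (assumed layout of the extracted text): if text were
#   ["1", "Chapter title", "body line A", "body line B", "2", "Next title", ...]
# the loop above would build
#   {"1": ["Chapter title", "body line A", "body line B"],
#    "2": ["Next title", ...]}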

tokenizer = Tokenizer()
model = Model()

# Write summaries with an explicit UTF-8 encoding so the Greek text is handled correctly
file_output = codecs.open("a.txt", "w", encoding='utf-8')

for k, v in chapter_dictionary.items():
    v = ' '.join(v)                        # flatten the chapter's lines into one string
    data = tokenizer.tokenize_sentence(v)  # split the chapter into sentences
    clusters = model.fit(data)             # cluster the sentences
    summarizer = Summarizer(clusters)
    file_output.write(summarizer.generate() + "\n")

file_output.close()
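
As a design note, since codecs.open is already used as a context manager for the input file above, the output file could be handled the same way so it is closed even if summarization fails partway through. A minimal sketch of the same write loop under that assumption:

with codecs.open("a.txt", "w", encoding='utf-8') as file_output:
    for k, v in chapter_dictionary.items():
        v = ' '.join(v)                        # flatten the chapter's lines
        data = tokenizer.tokenize_sentence(v)
        clusters = model.fit(data)
        summarizer = Summarizer(clusters)
        file_output.write(summarizer.generate() + "\n")

With the snippet saved as p3.py, it would be invoked roughly as python p3.py greekpdf-2.txt, writing one summary per chapter to a.txt.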