# Example #1
# 0
def text_rank(text, language):
    """Rank the sentences of *text* with a TextRank-style graph algorithm.

    Sentences are tokenized per *language* ('ukrainian' or anything else for
    Russian), reduced to sets of normalized non-stop-word tokens, and scored
    by pairwise set similarity; `rank_graph` then assigns each sentence a
    PageRank-like score.

    Returns a list of (index, score, sentence) tuples sorted by score,
    highest first.
    """
    # Select the language-specific resources once instead of duplicating
    # the whole pipeline in each branch.
    if language == 'ukrainian':
        morph = MorphAnalyzer(lang='uk')
        sentences = sent_tokenizer_ua(text)
        stop_words = stop_words_ua
    else:
        morph = MorphAnalyzer()
        sentences = sent_tokenizer_ru(text)
        stop_words = stop_words_ru

    if not sentences:
        # Empty input: original code crashed with IndexError here.
        return []
    if len(sentences) < 2:
        # A single sentence has no pairs to rank; return it as the summary.
        # BUG FIX: original returned (1, 0, s), which swaps index and score
        # relative to the (index, score, sentence) tuples returned below.
        return [(0, 1, sentences[0])]

    # One set of normalized, stop-word-filtered tokens per sentence.
    words = [
        set(morph.parse(word)[0].normalized
            for word in word_tokenizer.tokenize(sentence.lower())
            if word not in stop_words)
        for sentence in sentences
    ]

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    # Drop zero-similarity pairs: they contribute no graph edges.
    scores = [edge for edge in scores if edge[2]]
    pr = rank_graph(scores)

    # Sort the (index, score, sentence) triples by descending score.
    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True)
def text_rank(text, language):
    """Rank the sentences of *text* with TextRank over TF-IDF row vectors.

    Sentences are tokenized per *language* ('ukrainian' or anything else for
    Russian); `tfidf` builds a sentence-by-term matrix whose rows are compared
    pairwise with `similarity`; `rank_graph` then scores each sentence.

    Returns a list of (index, score, sentence) tuples sorted by score,
    highest first.
    """
    # Select language resources once; the original duplicated the whole
    # pipeline per branch and also constructed an unused MorphAnalyzer.
    if language == 'ukrainian':
        sentences = sent_tokenizer_ua(text)
        tokenizer, stop_words = sent_tokenizer_ua, stop_words_ua
    else:
        sentences = sent_tokenizer_ru(text)
        tokenizer, stop_words = sent_tokenizer_ru, stop_words_ru

    if not sentences:
        # Empty input: original code crashed with IndexError here.
        return []
    if len(sentences) < 2:
        # A single sentence has no pairs to rank; return it as the summary.
        # BUG FIX: original returned (1, 0, s), which swaps index and score
        # relative to the (index, score, sentence) tuples returned below.
        return [(0, 1, sentences[0])]

    # Sentence-by-term TF-IDF matrix; row i represents sentence i.
    a = tfidf(text, language, tokenizer, stop_words)

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(a[i, :], a[j, :])) for i, j in pairs]
    # Drop zero-similarity pairs: they contribute no graph edges.
    scores = [edge for edge in scores if edge[2]]
    pr = rank_graph(scores)

    # Sort the (index, score, sentence) triples by descending score.
    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True)
# Example #3
# 0
def test(language):
    """Summarize every article of the language's gazeta CSV to a result CSV.

    Reads 'ua_gazeta.csv' (Ukrainian) or 'ru_gazeta.csv' (default) with rows
    of (id, title, text), summarizes each text with GeneralSummarizer, and
    writes 'result<filename>' with columns Id, Title, Body, Summary.
    """
    # Pick the input file and sentence tokenizer once; the original
    # duplicated the entire summarization branch per language even though
    # only the tokenizer differed.
    filename = 'ru_gazeta.csv'
    tokenizer = sent_tokenizer_ru
    if language == 'ukrainian':
        filename = 'ua_gazeta.csv'
        tokenizer = sent_tokenizer_ua

    testData = []
    with io.open(filename, 'r', encoding="utf-8") as file:
        for row in csv.reader(file):
            testData.append({"id": row[0], "title": row[1], "text": row[2]})

    row_list = [["Id", "Title", "Body", "Summary"]]
    for testDataRow in testData:
        testedText = testDataRow["text"]
        sentences = tokenizer(testedText)
        # Short texts get a 2-sentence summary; longer ones ~10% of sentences.
        count = 2 if len(sentences) < 20 else len(sentences) // 10
        testDataRow["result"] = GeneralSummarizer.summarize(
            testedText, language, count)
        row_list.append([
            testDataRow["id"], testDataRow["title"], testDataRow["text"],
            testDataRow["result"]
        ])

    with io.open('result' + filename, 'w', newline='',
                 encoding="utf-8") as file:
        writer = csv.writer(file,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_ALL)
        writer.writerows(row_list)
# Example #4
# 0
def test(language):
    """Produce 20% and 40% summaries for every article of a dataset CSV.

    Reads 'UA_dataset_full.csv' (Ukrainian) or 'RU_dataset.csv' (default)
    with rows of (id, title, text). For each text, builds a ~20% and a ~40%
    summary with GeneralSummarizer, measures each summary's cosine similarity
    against the full text, and writes 'result<filename>' with columns
    Id, Title, Body, Summary20, Cosine20, Summary40, Cosine40.
    """
    # Pick the input file and sentence tokenizer once; the original
    # duplicated the entire count-selection branch per language even though
    # only the tokenizer differed.
    filename = 'RU_dataset.csv'
    tokenizer = sent_tokenizer_ru
    if language == 'ukrainian':
        filename = 'UA_dataset_full.csv'
        tokenizer = sent_tokenizer_ua

    testData = []
    with io.open(filename, 'r', encoding="utf-8") as file:
        for row in csv.reader(file):
            testData.append({"id": row[0], "title": row[1], "text": row[2]})

    row_list = [[
        "Id", "Title", "Body", "Summary20", "Cosine20", "Summary40", "Cosine40"
    ]]
    for testDataRow in testData:
        testedText = testDataRow["text"]
        n = len(tokenizer(testedText))
        # Sentence counts approximating 20% / 40% of the text, with floors
        # so very short texts still get a minimal summary.
        if n < 10:
            count20 = 2
            count40 = 2 if n < 5 else 4
        elif n < 20:
            count20, count40 = 4, 8
        else:
            count20, count40 = n // 5, n // 3

        result20 = GeneralSummarizer.summarize(testedText, language, count20)
        result40 = GeneralSummarizer.summarize(testedText, language, count40)

        cosine20 = get_cosine(text_to_vector(result20),
                              text_to_vector(testedText))
        cosine40 = get_cosine(text_to_vector(result40),
                              text_to_vector(testedText))
        testDataRow["result20"] = result20
        testDataRow["result40"] = result40
        row_list.append([
            testDataRow["id"], testDataRow["title"], testDataRow["text"],
            testDataRow["result20"],
            str(cosine20), testDataRow["result40"],
            str(cosine40)
        ])

    with io.open('result' + filename, 'w', newline='',
                 encoding="utf-8") as file:
        writer = csv.writer(file,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_ALL)
        writer.writerows(row_list)