Code example #1
    def generateGraph(text, outputfile, outputdir, plotGraph=False):
        print('Generating Graph...')
        # Start by doing statistical parsing/tagging on the document in JSON format
        temp_file = os.path.join(outputdir, 'temp.json')
        path_stage1 = os.path.join(outputdir,
                                   outputfile.split("_")[0] + '_o1.json')
        txtToJson.textTojson(text, temp_file)
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(pytextrank.json_iter(temp_file)):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        #Collect and Normalize the key sentences from the parsed doc
        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)
        #path_stage2 = path_stage1.replace('o1', 'o2')
        path_stage2 = os.path.join(outputdir, outputfile)
        try:
            os.remove(path_stage2)  # remove the stale output file, not the bare filename
        except OSError:
            pass
        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
                #print(pytextrank.pretty_print(rl))
        try:
            os.remove(temp_file)
        except OSError:
            pass

        if plotGraph:
            matplotlib.rcParams['figure.figsize'] = (15.0, 15.0)
            # draw once, with labels; the original drew the same graph twice
            # through two different aliases (networkx and nx)
            networkx.draw_networkx(graph, with_labels=True)
            plt.show()
Code example #2
File: try2.py Project: shreyks/Megathon2019
def one(text):

    path_stage0 = "tempfile.json"
    path_stage1 = "o1.json"
    path_stage2 = "o2.json"

    # build the input record with json.dump (requires `import json`) so that
    # quotes and backslashes in `text` are escaped correctly
    with open(path_stage0, "w") as f:
        json.dump({"id": "777", "text": text}, f)

    with open(path_stage1, 'w') as f:
        for graf in parse_doc(json_iter(path_stage0)):
            f.write("%s\n" % pretty_print(graf._asdict()))

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    outputs = []
    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(path_stage1, ranks):
            ans = "%s\n" % pretty_print(rl._asdict())
            f.write(ans)  # the original opened f but never wrote to it
            output = ast.literal_eval(ans)
            outputs.append((output["text"], output["rank"]))

    os.remove(path_stage0)

    return outputs


# text = "The earliest recorded model for planetary motions proposed by Ptolemy about 2000 years ago was a ‘geocentric’ model in which all celestial objects, stars, the sun and the planets, all revolved around the earth."
# print (one("The earliest recorded model for planetary motions proposed by Ptolemy about 2000 years ago was a ‘geocentric’ model in which all celestial objects, stars, the sun and the planets, all revolved around the earth."))
Code example #3
def stage_2():
    cur_dir = os.path.dirname(__file__)
    data_dir = stage_1_dir
    ids = os.listdir(data_dir)

    result_dir = stage_2_dir
    if os.path.exists(result_dir):
        shutil.rmtree(result_dir, ignore_errors=True)
    os.mkdir(result_dir)
    os.chdir(result_dir)

    if not os.path.exists('pictures'):
        os.mkdir('pictures')

    for cur_id in ids:
        if os.path.exists(cur_id):
            continue

        cur_file_name = os.path.join(data_dir, cur_id)
        print(cur_id)
        graph, ranks = pytextrank.text_rank(cur_file_name)
        pytextrank.render_ranks(graph, ranks, cur_id)

        with codecs.open(cur_id, "w+", "utf_8_sig") as file:
            for rl in pytextrank.normalize_key_phrases(cur_file_name, ranks):
                file.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    os.chdir(cur_dir)
Code example #4
    def collect_and_normalise_key_phrases(self, paragraph_output,
                                          key_phrases_output):
        """
            Collect and normalise key phrases from the sentences in
            the paragraph (in the JSON doc)
            Rank them using PyTextRank, return a graph and ranked tokens

            Parameters
            ==========
            paragraph_output:
               tagged and parsed JSON document as text file
            key_phrases_output:
               output text file (JSON) into which key phrases are stored

            Return
            ======
            Returns a graph (object) and ranked tokens (dictionary)
        """

        graph, token_ranks = pytextrank.text_rank(paragraph_output)
        pytextrank.render_ranks(graph, token_ranks)

        with open(key_phrases_output, 'w') as temp_file:
            for relationship in pytextrank.normalize_key_phrases(
                    paragraph_output, token_ranks):
                temp_file.write(
                    "%s\n" % pytextrank.pretty_print(relationship._asdict()))

        return graph, token_ranks
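
A minimal usage sketch for this method; the `KeyPhraseExtractor` class name, its argument-free constructor, and the file names are assumptions for illustration, not part of the original project:

# hypothetical usage; 'paragraph_o1.json' is assumed to be the output of an
# earlier pytextrank.parse_doc() stage
extractor = KeyPhraseExtractor()
graph, token_ranks = extractor.collect_and_normalise_key_phrases(
    'paragraph_o1.json', 'key_phrases_o2.json')
# token_ranks maps graph nodes to their TextRank scores
top_five = sorted(token_ranks.items(), key=lambda kv: kv[1], reverse=True)[:5]
print(top_five)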
Code example #5
def stage2(path_stage1, path_stage2):
    #Stage 2
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
Code example #6
    def summarize(self, _id, content_text, word_limit):
        self.logger.log("_id: " + _id)
        self.logger.log("word_limit: " + str(word_limit))

        # File names
        path_stage0 = 'process/' + _id + '.json'
        path_stage1 = 'process/' + _id + '_o1.json'
        path_stage2 = 'process/' + _id + '_o2.json'
        path_stage3 = 'process/' + _id + '_o3.json'
        path_stage4 = 'process/' + _id + '_o4.json'

        # Create input file
        with open(path_stage0, 'w') as outfile:
            json.dump({"id": "123", "text": content_text}, outfile)

        # Statistical Parsing - Stage 1
        # Perform statistical parsing/tagging on a document in JSON format
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(
                    pytextrank.json_iter(path_stage0)):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        # Ranked Keyphrases - Stage 2
        # Collect and normalize the key phrases from a parsed document
        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)

        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

        # Extractive Summarization - Stage 3
        # Calculate a significance weight for each sentence, using MinHash to
        # approximate a Jaccard distance from key phrases determined by TextRank
        kernel = pytextrank.rank_kernel(path_stage2)

        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")

        # Final Output - Stage 4
        # Summarize a document based on most significant sentences and key phrases
        phrases = ", ".join(
            set([
                p for p in pytextrank.limit_keyphrases(path_stage2,
                                                       phrase_limit=12)
            ]))
        sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                      word_limit=word_limit),
                           key=lambda x: x[1])
        s = []

        for sent_text, idx in sent_iter:
            s.append(pytextrank.make_sentence(sent_text))

        graf_text = " ".join(s)

        return {'excerpts': graf_text, 'keywords': phrases}
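
A hedged invocation sketch; the `Summarizer` class name, the document id, and the input text are placeholders, and the method additionally assumes an existing process/ directory and a configured self.logger:

# hypothetical usage; requires a 'process/' directory and a logger on the class
summarizer = Summarizer()
result = summarizer.summarize('123', 'Some long input document ...',
                              word_limit=100)
print(result['excerpts'])  # extractive summary text
print(result['keywords'])  # comma-separated key phrases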
Code example #7
def pred_net(sample_case):

    # Path and displacy were imported in the original but never used
    import json
    import keras
    import matplotlib.pyplot as plt
    import networkx as nx
    import numpy as np
    import pytextrank
    from PIL import Image

    path_stage0 = "o0.json"
    path_stage1 = "o1.json"

    file_dic = {"id": 0, "text": sample_case}

    # dump the dict directly; the original's json.loads(json.dumps(...))
    # round-trip was a no-op
    with open(path_stage0, 'w') as outfile:
        json.dump(file_dic, outfile)

    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
            print(pytextrank.pretty_print(graf._asdict()))

    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    nx.draw(graph, with_labels=True)
    plt.savefig("sample_case.png", dpi=200, format='png', bbox_inches='tight')
    plt.close()

    im = Image.open("sample_case.png").convert('L').resize((300, 200))
    sample_image = np.array([np.array(im)])
    sample_image = sample_image.reshape(sample_image.shape[0],
                                        sample_image.shape[1],
                                        sample_image.shape[2], 1)

    model = keras.models.load_model("graph_conv_autoencoder.hdf5")

    y_pred = model.predict(sample_image)
    labels = [
        'Major Depressive Disorder',
        'Attention Deficit Hyperactivity Disorder',
        'Oppositional Defiant Disorder', 'Conduct Disorder',
        'Pervasive Developmental Disorder',
        'Intellectual Disability (Mental Retardation)', 'Psychotic Disorder',
        'Adjustment Disorder', 'Mood Disorder', 'General Anxiety Disorder',
        'Social Anxiety Disorder', 'Seasonal Affective Disorder',
        'Substance Abuse', 'Autism Spectrum Disorder'
    ]

    max1 = labels[np.argmax(y_pred)]

    # fall back to None so the return below cannot raise NameError
    about1 = treatment1 = None
    with open('external_resources.json') as data_file:
        for v in json.load(data_file):
            if v['diagnosis'] == max1:
                about1, treatment1 = v['about'], v['treatment']

    return (max1, about1, treatment1)
Code example #8
def execute_stage_two(path_stage1):
    graph, ranks = pytextrank.text_rank(os.path.join(PATH_PREFIX, path_stage1))
    pytextrank.render_ranks(graph, ranks)
    path_name_components = path_stage1.split('.')
    path_name_components[path_name_components.index('stage1')] = 'stage2'
    path_stage2 = '.'.join(path_name_components)
    with open(os.path.join(PATH_PREFIX, path_stage2), 'w') as f:
        for rl in pytextrank.normalize_key_phrases(os.path.join(PATH_PREFIX, path_stage1),
                                                   ranks, stopwords=stopwords):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
    return path_stage2
Code example #9
    def collect_and_normalise_key_phrases(self, paragraph_output,
                                          key_phrases_output):
        graph, token_ranks = pytextrank.text_rank(paragraph_output)
        pytextrank.render_ranks(graph, token_ranks)

        with open(key_phrases_output, 'w') as f:
            for relationship in pytextrank.normalize_key_phrases(
                    paragraph_output, token_ranks):
                f.write("%s\n" %
                        pytextrank.pretty_print(relationship._asdict()))

        return graph, token_ranks
Code example #10
    def collect_and_normalise_key_phrases(self, paragraph_output, key_phrases_output):
        """
            Collect and normalise key phrases from the sentences in
            the paragraph (in the JSON doc)
            Rank them using PyTextRank, return a graph and ranked tokens
        """

        graph, token_ranks = pytextrank.text_rank(paragraph_output)
        pytextrank.render_ranks(graph, token_ranks)

        with open(key_phrases_output, 'w') as temp_file:
            for relationship in pytextrank.normalize_key_phrases(paragraph_output, token_ranks):
                temp_file.write("%s\n" % pytextrank.pretty_print(relationship._asdict()))

        return graph, token_ranks
Code example #11
def summarize_text(input_file):
    # seriously f**k this API
    path_stage0 = input_file
    path_stage1 = 'stage1.txt'
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
            # to view output in this notebook
            #print(pytextrank.pretty_print(graf))

    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    path_stage2 = 'stage2.txt'
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
            # to view output in this notebook
            #print(pytextrank.pretty_print(rl))

    path_stage3 = 'stage3.txt'
    kernel = pytextrank.rank_kernel(path_stage2)

    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")
            # to view output in this notebook
            #print(pytextrank.pretty_print(s._asdict()))

    phrases = ", ".join(
        set([
            p
            for p in pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)
        ]))
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=120),
                       key=lambda x: x[1])
    s = []

    for sent_text, idx in sent_iter:
        s.append(pytextrank.make_sentence(sent_text))

    graf_text = " ".join(s)
    #print("**excerpts:** %s\n\n**keywords:** %s" % (graf_text, phrases,))

    return graf_text
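
A minimal call sketch, assuming the input file is a JSON-lines document of the form consumed by pytextrank.json_iter; the file name is hypothetical:

# hypothetical usage; doc.json contains a record like {"id": "1", "text": "..."}
summary = summarize_text('doc.json')
print(summary)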
Code example #12
def _get_keywords(path_stage0, path_stage2):
    # Stage 1: parse doc
    path_stage1 = 'o1.json'
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2: rank words
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    result_dict = dict()
    with open(path_stage2, 'w') as f2:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            ro = rl._asdict()
            # persist the normalized phrase (the original opened f2 but
            # left the write commented out) and record its rank
            f2.write("%s\n" % pytextrank.pretty_print(ro))
            result_dict[ro['text']] = ro['rank']

    return result_dict
Code example #13
def rank_bill(bill):
    bill_id = bill['bill_id']
    path_stage1 = prefix + '/{}_stage1'.format(bill_id)
    with open(path_stage1, 'w') as f:
        for graf in parse_doc([bill]):
            f.write(pretty_print(graf._asdict()))
            f.write('\n')

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    path_stage2 = prefix + '/{}_stage2'.format(bill_id)
    # open the stage-2 file once; reopening it with 'w' inside the loop
    # would truncate it on each iteration and keep only the last phrase
    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(path_stage1, ranks):
            f.write(pretty_print(rl._asdict()))
            f.write('\n')

    kernel = rank_kernel(path_stage2)
    with open(prefix + '/{}_stage3'.format(bill_id), 'w') as f:
        for s in top_sentences(kernel, path_stage1):
            f.write(pretty_print(s._asdict()))
            f.write('\n')
Code example #14
def do_pytextrank(data):
    for item in data:
        for subItem in data[item]:
            print('###############')
            print('description:', subItem['description'])

            # using pytextrank
            # reference https://github.com/ceteri/pytextrank/issues/18

            # raw input
            subItemJSON = {'id': subItem['id'], 'text': subItem['description']}
            subItemJSON = json.dumps(subItemJSON)
            with open('sub_item.json', 'w') as outFile:
                outFile.write(subItemJSON)

            # stage 1
            with open('stage1_output.json', 'w') as outFile:
                for graf in pytextrank.parse_doc(
                        pytextrank.json_iter('sub_item.json')):
                    outFile.write("%s\n" %
                                  pytextrank.pretty_print(graf._asdict()))

            # stage 2
            graph, ranks = pytextrank.text_rank('stage1_output.json')
            pytextrank.render_ranks(graph, ranks)
            rlLists = []
            print('key phrases:')
            with open('stage2_output.json', 'w') as outFile:
                for rl in pytextrank.normalize_key_phrases(
                        'stage1_output.json', ranks):
                    # ast.literal_eval is safer than eval for parsing the
                    # pretty-printed lexeme (requires `import ast`)
                    rlList = ast.literal_eval(pytextrank.pretty_print(rl))
                    rlLists.append(rlList)
                    print(rlList)

            # cleanup
            os.system(
                'rm -f sub_item.json stage1_output.json stage2_output.json graph.dot'
            )

            # input filter results based on pos
            # this is a heuristic
            filteredRlLists = [x for x in rlLists if 'nn' not in x[-2]]
            if (len(filteredRlLists) == 0):
                # invalid case
                continue
            else:
                [heuristic, iOItem] = do_heuristic(subItem, filteredRlLists)
                print('heuristic:', heuristic)
                print('i/o input:', iOItem)

            # input filter results based on pos
            # this is a heuristic
            filteredRlLists = [x for x in rlLists if 'nn' in x[-2]]
            if (len(filteredRlLists) == 0):
                # invalid case
                continue
            else:
                [heuristic, iOItem] = do_heuristic(subItem, filteredRlLists)
                print('heuristic:', heuristic)
                print('i/o output:', iOItem)

            print('###############')
Code example #15
def extract_phrasesfrom_textrank(corpus):
    record_data = pd.DataFrame({'sentences': corpus})
    record_data = pd.DataFrame({
        'id': record_data.index.tolist(),
        'text': record_data['sentences'].tolist()
    })
    tweet_items = []
    for jdict in record_data.to_dict(orient='records'):
        tweet_items.append(jdict)

    new_df_tweet = pd.DataFrame(columns=['text', 'keywords'])
    path_stage1 = "celebrity1_tweet.json"
    path_stage2 = "celebrity2_tweet.json"
    path_stage3 = "celebrity3_tweet.json"
    for item in tweet_items:
        items_new = [item]
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(items_new):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)

        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

        kernel = pytextrank.rank_kernel(path_stage2)

        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")
        phrases = ", ".join(
            set([
                p for p in pytextrank.limit_keyphrases(path_stage2,
                                                       phrase_limit=5)
            ]))
        sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                      word_limit=150),
                           key=lambda x: x[1])
        s = []

        for sent_text, idx in sent_iter:
            s.append(pytextrank.make_sentence(sent_text))

        graf_text = " ".join(s)
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        new_df_tweet = pd.concat(
            [new_df_tweet,
             pd.DataFrame([{'text': item.get('text'), 'keywords': phrases}])],
            ignore_index=True)

    celeb_list = [
        'Bradley Cooper', 'Chris Kyle', 'Clint Eastwood', 'bradley cooper',
        'bradley', 'cooper', 'chris kyle', 'chris', 'kyle', 'clint eastwood',
        'clint', 'eastwood'
    ]

    cleaned_df_tweet = pd.DataFrame(columns=['sentences', 'keywords'])
    for index, row in new_df_tweet.iterrows():
        if any(celeb in row['keywords'] for celeb in celeb_list):
            cleaned_df_tweet = pd.concat(
                [cleaned_df_tweet,
                 pd.DataFrame([{'sentences': row['text'],
                                'keywords': row['keywords']}])],
                ignore_index=True)

    cleaned_df_tweet.to_csv(phrase_filepath,
                            sep=',',
                            encoding='utf-8',
                            index=False)
    new_df_tweet.to_csv(all_phrasefile_path,
                        sep=',',
                        encoding='utf-8',
                        index=False)
    return new_df_tweet, cleaned_df_tweet
Code example #16
#!/usr/bin/env python
# encoding: utf-8

from pytextrank import normalize_key_phrases, pretty_print, render_ranks, text_rank
import sys

## Stage 2:
##  * collect and normalize the key phrases from a parsed document
##
## INPUTS: <stage1>
## OUTPUT: JSON format `RankedLexeme(text, rank, ids, pos)`

if __name__ == "__main__":
    path_stage1 = sys.argv[1]

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    for rl in normalize_key_phrases(path_stage1, ranks):
        # print (rl)
        print(pretty_print(rl._asdict()))
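
Each printed line is one JSON-serialized RankedLexeme with the fields listed above. An illustrative, made-up line (not real output) might look like:

{"ids": [12], "pos": "np", "rank": 0.0862, "text": "minimal generating sets"}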
Code example #17
# Stage 1

import pytextrank as ptr  # the excerpt uses ptr but omitted the import

path_stage0 = "../tests/pytextrank_dat/mih.json"
path_stage1 = "../tests/pytextrank_dat/o1.json"

with open(path_stage1, 'w') as f:
    for graf in ptr.parse_doc(ptr.json_iter(path_stage0)):
        f.write("%s\n" % ptr.pretty_print(graf._asdict()))
        print(ptr.pretty_print(graf._asdict()))

# Stage 2
path_stage2 = "../tests/pytextrank_dat/o2.json"

graph, ranks = ptr.text_rank(path_stage1)
ptr.render_ranks(graph, ranks)

with open(path_stage2, 'w') as f:
    for rl in ptr.normalize_key_phrases(path_stage1, ranks):
        f.write("%s\n" % ptr.pretty_print(rl._asdict()))
        print(ptr.pretty_print(rl._asdict()))

# Stage 3
import networkx as nx
# import pylab as plt

nx.draw(graph, with_labels=True)
# plt.show()

path_stage3 = "../tests/pytextrank_dat/o3.json"
Code example #18
def keyPhrases():
    graph, ranks = pytextrank.text_rank('temp2.json')
    pytextrank.render_ranks(graph, ranks)
    with open('temp3.json', 'w') as f:
        for rl in pytextrank.normalize_key_phrases('temp2.json', ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
Code example #19
path_stage0 = "test.json"
path_stage1 = "o1.json"

# the construction of loaded_file_dic and its enclosing `with open(...)` were
# truncated in the original excerpt; it holds a {"id": ..., "text": ...} dict
with open(path_stage0, 'w') as outfile:
    json.dump(loaded_file_dic, outfile)

# Extract keyword using pytextrank
with open(path_stage1, 'w') as f:
    for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
        f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
        #print(pytextrank.pretty_print(graf._asdict()))

path_stage1 = "o1.json"
path_stage2 = "o2.json"

graph, ranks = pytextrank.text_rank(path_stage1)
pytextrank.render_ranks(graph, ranks)

with open(path_stage2, 'w') as f:
    for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
        f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
        #print(pytextrank.pretty_print(rl))

path_stage1 = "o1.json"
path_stage2 = "o2.json"
path_stage3 = "o3.json"

kernel = pytextrank.rank_kernel(path_stage2)

with open(path_stage3, 'w') as f:
    for s in pytextrank.top_sentences(kernel, path_stage1):
        f.write(pytextrank.pretty_print(s._asdict()))
Code example #20
def insert_key_phrases_into_db(list_of_doc_dicts, doctype, collection):
    '''
    Takes in list of doc dictionaries and a doctype ('comment' or 'post'), 
    processes each doc with PyTextRank, obtains key phrases and 
    inserts key phrases into document in Mongodb as 'key_phrases' field.
    '''
    path_stage0 = 'stage0.json'
    path_stage1 = 'stage1.json'
    path_stage2 = 'stage2.json'
    path_stage3 = 'stage3.json'

    total_docs = len(list_of_doc_dicts)

    failed_ids = []
    for i, doc_dict in enumerate(list_of_doc_dicts):
        if i % 50 == 0:
            print(f'processing {i} of {total_docs} documents')
        doc_dict['text'] = doc_dict['text'].split('\n_____\n\n')[0]

        try:
            with open(path_stage0, 'w') as f:
                json.dump(doc_dict, f)
            # Stage 1
            with open(path_stage1, 'w') as f:
                for graf in pytextrank.parse_doc(
                        pytextrank.json_iter(path_stage0)):
                    f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
                    # print(pytextrank.pretty_print(graf))
            # Stage 2
            graph, ranks = pytextrank.text_rank(path_stage1)
            pytextrank.render_ranks(graph, ranks)
            with open(path_stage2, 'w') as f:
                for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                    f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
                    # to view output in this notebook
                    # print(pytextrank.pretty_print(rl))
            # Stage 3
            kernel = pytextrank.rank_kernel(path_stage2)
            with open(path_stage3, 'w') as f:
                for s in pytextrank.top_sentences(kernel, path_stage1):
                    f.write(pytextrank.pretty_print(s._asdict()))
                    f.write("\n")
                    # to view output in this notebook
                    # print(pytextrank.pretty_print(s._asdict()))
            # Stage 4
            phrase_list = list(
                set([
                    p for p in pytextrank.limit_keyphrases(path_stage2,
                                                           phrase_limit=15)
                ]))
            phrases = ", ".join(phrase_list)

            sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                          word_limit=150),
                               key=lambda x: x[1])
            s = []

            for sent_text, idx in sent_iter:
                s.append(pytextrank.make_sentence(sent_text))

            graf_text = " ".join(s)
            collection.update_one({f'{doctype}_id': {
                '$eq': doc_dict['id']
            }}, {'$set': {
                'key_phrases': phrase_list
            }})
        except Exception:  # a bare except would also swallow KeyboardInterrupt
            failed_ids.append(doc_dict['id'])
            print('failed on ', doc_dict['id'])
            continue

    # surface which documents failed instead of discarding the list
    return failed_ids
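
A hedged usage sketch; the pymongo connection string, database and collection names, and the sample document are placeholders:

# hypothetical usage; requires a running MongoDB instance and pymongo
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
collection = client['reddit']['comments']
docs = [{'id': 'abc123', 'text': 'Some comment body ...'}]
failed = insert_key_phrases_into_db(docs, 'comment', collection)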