def collect_and_normalise_key_phrases(self, paragraph_output,
                                          key_phrases_output):
        """
            Collect and normalise key phrases from the sentences in
            the paragraph (in the JSON doc)
            Rank them using PyTextRank, return a graph and ranked tokens

            Parameters
            ==========
            paragraph_output:
               tagged and parsed JSON document as text file
            key_phrases_output:
               output text file (JSON) into which key phrases are stored

            Return
            ======
            Returns a graph (object) and ranked tokens (dictionary)
        """

        graph, token_ranks = pytextrank.text_rank(paragraph_output)
        pytextrank.render_ranks(graph, token_ranks)

        with open(key_phrases_output, 'w') as temp_file:
            for relationship in pytextrank.normalize_key_phrases(
                    paragraph_output, token_ranks):
                temp_file.write(
                    "%s\n" % pytextrank.pretty_print(relationship._asdict()))

        return graph, token_ranks
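For reference, the `paragraph_output` argument above is the stage-1 file written by `pytextrank.parse_doc`. A minimal sketch of that producing step, using the same pytextrank 1.x calls as the examples below; the file names are assumptions:

import pytextrank

path_stage0 = "doc.json"               # hypothetical stage-0 input: {"id": "...", "text": "..."}
path_stage1 = "paragraph_output.json"  # becomes this method's paragraph_output

with open(path_stage1, 'w') as f:
    for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
        f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))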
Example #2
def stage_2():
    cur_dir = os.path.abspath(os.path.dirname(__file__))  # absolute, so the final chdir back works
    data_dir = stage_1_dir
    ids = os.listdir(data_dir)

    result_dir = stage_2_dir
    if os.path.exists(result_dir):
        shutil.rmtree(result_dir, ignore_errors=True)
    os.mkdir(result_dir)
    os.chdir(result_dir)

    if not os.path.exists('pictures'):
        os.mkdir('pictures')

    for cur_id in ids:
        if os.path.exists(cur_id):
            continue

        cur_file_name = os.path.join(data_dir, cur_id)  # portable path join
        print(cur_id)
        graph, ranks = pytextrank.text_rank(cur_file_name)
        pytextrank.render_ranks(graph, ranks, cur_id)

        with codecs.open(cur_id, "w+", "utf_8_sig") as file:
            for rl in pytextrank.normalize_key_phrases(cur_file_name, ranks):
                file.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    os.chdir(cur_dir)
Example #3
    def generateGraph(text, outputfile, outputdir, plotGraph=False):
        print('Generating Graph...')
        # Start with statistical parsing/tagging of the text
        temp_file = os.path.join(outputdir, 'temp.json')
        path_stage1 = os.path.join(outputdir,
                                   outputfile.split("_")[0] + '_o1.json')
        txtToJson.textTojson(text, temp_file)
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(pytextrank.json_iter(temp_file)):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        #Collect and Normalize the key sentences from the parsed doc
        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)
        #path_stage2 = path_stage1.replace('o1', 'o2')
        path_stage2 = os.path.join(outputdir, outputfile)
        try:
            # remove a stale stage-2 output if present (full path, not the bare file name)
            os.remove(path_stage2)
        except OSError:
            pass
        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
                #print(pytextrank.pretty_print(rl))
        try:
            os.remove(temp_file)
        except OSError:
            pass

        if plotGraph:
            matplotlib.rcParams['figure.figsize'] = (15.0, 15.0)
            networkx.draw_networkx(graph, with_labels=True)
            plt.show()
Example #4
def one(text):

    path_stage0 = "tempfile.json"
    path_stage1 = "o1.json"
    path_stage2 = "o2.json"

    # json.dump escapes quotes and newlines safely, unlike manual string building
    with open(path_stage0, "w") as f:
        json.dump({"id": "777", "text": text}, f)

    with open(path_stage1, 'w') as f:
        for graf in parse_doc(json_iter(path_stage0)):
            f.write("%s\n" % pretty_print(graf._asdict()))

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    outputs = []
    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(path_stage1, ranks):
            ans = "%s\n" % pretty_print(rl._asdict())
            output = ast.literal_eval(ans)
            outputs.append((output["text"], output["rank"]))

    os.remove(path_stage0)

    return outputs


# text = "The earliest recorded model for planetary motions proposed by Ptolemy about 2000 years ago was a ‘geocentric’ model in which all celestial objects, stars, the sun and the planets, all revolved around the earth."
# print (one("The earliest recorded model for planetary motions proposed by Ptolemy about 2000 years ago was a ‘geocentric’ model in which all celestial objects, stars, the sun and the planets, all revolved around the earth."))
Example #5
def text_rank(json_request):
    pattern = re.compile("TI  - (.*?)\\r|AB  - (.*?)\\r")
    matches = re.findall(pattern, json_request['ris'])
    all_inputs = []
    for section in matches:
        # each match is a (title, abstract) tuple with one empty slot
        all_inputs.append(' '.join(section).strip())

    input_json = {}
    input_json['id'] = "0"
    input_json['text'] = '.'.join(all_inputs)

    with open('ris_extracted.json', 'w') as output:
        json.dump(input_json, output)

    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter('ris_extracted.json')):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    graph, ranks = pytextrank.text_rank(path_stage1)

    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    phrases = list(pytextrank.limit_keyphrases(path_stage2, phrase_limit=20))

    return phrases
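The regex above extracts the RIS title (TI) and abstract (AB) fields from `json_request['ris']`. A hypothetical input showing the expected shape (the record content is invented; the call relies on the module-level `path_stage1` and `path_stage2` used above):

json_request = {
    'ris': "TY  - JOUR\r"
           "TI  - Planetary motion models\r"
           "AB  - A survey of geocentric and heliocentric models.\r"
           "ER  - \r"
}
phrases = text_rank(json_request)  # up to 20 normalized key phrases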
Example #6
def stage2(path_stage1, path_stage2):
    # Stage 2: collect and normalize the key phrases
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
Example #7
def obj_to_keywords(obj):
    if not isinstance(obj, list):
        obj = [obj]
    grafs = list(pytextrank.parse_doc(obj))
    dicts = [graf._asdict() for graf in grafs]
    # text_rank also accepts an in-memory iterable of graf dicts
    graph, ranks = pytextrank.text_rank(dicts)
    keywords = [rl._asdict() for rl in pytextrank.normalize_key_phrases(dicts, ranks)]
    return keywords
Example #8
    def summarize(self, _id, content_text, word_limit):
        self.logger.log("_id: " + _id)
        self.logger.log("word_limit: " + str(word_limit))

        # File names
        path_stage0 = 'process/' + _id + '.json'
        path_stage1 = 'process/' + _id + '_o1.json'
        path_stage2 = 'process/' + _id + '_o2.json'
        path_stage3 = 'process/' + _id + '_o3.json'
        path_stage4 = 'process/' + _id + '_o4.json'

        # Create input file
        with open(path_stage0, 'w') as outfile:
            json.dump({"id": "123", "text": content_text}, outfile)

        # Statistical Parsing - Stage 1
        # Perform statistical parsing/tagging on a document in JSON format
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(
                    pytextrank.json_iter(path_stage0)):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        # Ranked Keyphrases - Stage 2
        # Collect and normalize the key phrases from a parsed document
        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)

        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

        # Extractive Summarization - Stage 3
        # Calculate a significance weight for each sentence, using MinHash to
        # approximate a Jaccard distance from key phrases determined by TextRank
        kernel = pytextrank.rank_kernel(path_stage2)

        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")

        # Final Output - Stage 4
        # Summarize a document based on most significant sentences and key phrases
        phrases = ", ".join(
            set([
                p for p in pytextrank.limit_keyphrases(path_stage2,
                                                       phrase_limit=12)
            ]))
        sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                      word_limit=word_limit),
                           key=lambda x: x[1])
        s = []

        for sent_text, idx in sent_iter:
            s.append(pytextrank.make_sentence(sent_text))

        graf_text = " ".join(s)

        return {'excerpts': graf_text, 'keywords': phrases}
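A hedged usage sketch; the instance name and document text are assumptions (the method also expects a `process/` directory to exist):

result = summarizer.summarize("abc123", "Long document text ...", word_limit=120)
print(result['keywords'])  # comma-separated key phrases
print(result['excerpts'])  # extractive summary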
Example #9
def pytrankSummarize(filename):
    """
    This is another TextRank algorithm. It works in four stages, each feeding its output to the next:
    1. Part-of-speech tagging and lemmatization are performed for every sentence in the document.
    2. Key phrases are extracted along with their counts, and are normalized.
    3. A score is calculated for each sentence by approximating the Jaccard distance between the sentence and the key phrases.
    4. The document is summarized based on the most significant sentences and key phrases.
    """

    import pytextrank

    jsonText = createJSON(filename)

    path_stage0 = jsonText
    path_stage1 = "o1.json"

    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    path_stage2 = "o2.json"

    graph, ranks = pytextrank.text_rank(path_stage1)

    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    path_stage3 = "o3.json"

    kernel = pytextrank.rank_kernel(path_stage2)

    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")

    phrases = ", ".join(
        set([
            p
            for p in pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)
        ]))
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=50),
                       key=lambda x: x[1])
    s = []

    for sent_text, idx in sent_iter:
        s.append(pytextrank.make_sentence(sent_text))

    graf_text = " ".join(s)

    print("")
    print("####### From PyTextRank #######")
    print("**excerpts:** %s\n\n**keywords:** %s" % (
        graf_text,
        phrases,
    ))
Example #10
def execute_stage_two(path_stage1):
    graph, ranks = pytextrank.text_rank(os.path.join(PATH_PREFIX, path_stage1))
    pytextrank.render_ranks(graph, ranks)
    # swap the 'stage1' component for 'stage2', keeping the dot-separated name intact
    path_name_components = path_stage1.split('.')
    path_name_components[path_name_components.index('stage1')] = 'stage2'
    path_stage2 = '.'.join(path_name_components)
    with open(os.path.join(PATH_PREFIX, path_stage2), 'w') as f:
        for rl in pytextrank.normalize_key_phrases(os.path.join(PATH_PREFIX, path_stage1),
                                                   ranks, stopwords=stopwords):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
    return path_stage2
Example #11
    def collect_and_normalise_key_phrases(self, paragraph_output,
                                          key_phrases_output):
        graph, token_ranks = pytextrank.text_rank(paragraph_output)
        pytextrank.render_ranks(graph, token_ranks)

        with open(key_phrases_output, 'w') as f:
            for relationship in pytextrank.normalize_key_phrases(
                    paragraph_output, token_ranks):
                f.write("%s\n" %
                        pytextrank.pretty_print(relationship._asdict()))

        return graph, token_ranks
Example #12
    def generate_phrases(self):
        """From the graph, take the phrases, with their count and rank"""
        _, ranks = pytextrank.text_rank(self.graphs)
        for normal_phrase in pytextrank.normalize_key_phrases(
                self.graphs,
                ranks,
                stopwords=[
                    "not_a_word______",
                ]
                # A dummy stopword list is needed because we don't want
                # pytextrank to remove stopwords at this point.
        ):
            self.all_phrases.append(dict(normal_phrase._asdict()))
Example #13
def stage_2_multiprocess(args):
    fake_json_graph_dicts, ranks, thread_num = args
    rl_fake_json = []
    for rl in pytextrank.normalize_key_phrases(fake_json_graph_dicts,
                                               ranks,
                                               stopwords=RAKE.SmartStopList()):
        print(pytextrank.pretty_print(rl))
        rl_fake_json.append([rl._asdict()])
    stage_2_filename = "{publisher}_{version}_textRank_{thread_num}_rl.Stage2.".format(
        version=version, thread_num=thread_num, publisher=publisher)
    stage_2_out = os.path.join(directory, "Stage2Results", stage_2_filename)
    with open(stage_2_out, 'wb') as out:
        pickle.dump(rl_fake_json, out)
Example #14
    def collect_and_normalise_key_phrases(self, paragraph_output, key_phrases_output):
        """
            Collect and normalise key phrases from the sentences in
            the paragraph (in the JSON doc)
            Rank them using PyTextRank, return a graph and ranked tokens
        """

        graph, token_ranks = pytextrank.text_rank(paragraph_output)
        pytextrank.render_ranks(graph, token_ranks)

        with open(key_phrases_output, 'w') as temp_file:
            for relationship in pytextrank.normalize_key_phrases(paragraph_output, token_ranks):
                temp_file.write("%s\n" % pytextrank.pretty_print(relationship._asdict()))

        return graph, token_ranks
Example #15
def summarize_text(input_file):
    # run the full four-stage pytextrank pipeline over one input file
    path_stage0 = input_file
    path_stage1 = 'stage1.txt'
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
            # to view output in this notebook
            #print(pytextrank.pretty_print(graf))

    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    path_stage2 = 'stage2.txt'
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
            # to view output in this notebook
            #print(pytextrank.pretty_print(rl))

    path_stage3 = 'stage3.txt'
    kernel = pytextrank.rank_kernel(path_stage2)

    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")
            # to view output in this notebook
            #print(pytextrank.pretty_print(s._asdict()))

    phrases = ", ".join(
        set([
            p
            for p in pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)
        ]))
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=120),
                       key=lambda x: x[1])
    s = []

    for sent_text, idx in sent_iter:
        s.append(pytextrank.make_sentence(sent_text))

    graf_text = " ".join(s)
    #print("**excerpts:** %s\n\n**keywords:** %s" % (graf_text, phrases,))

    return graf_text
Example #16
def text_ranking(video_seg_id, book_segment):
    """
    :param book_segment: book segment in json format
    :return: key sentences and key phrases
    """
    # os.chdir(video_path)
    # creating directory to store segments for clean structure
    if not os.path.exists('TextRank_data'):
        os.mkdir('TextRank_data')
    if not os.path.exists('TextRank_data/seg' + str(video_seg_id)):
        os.mkdir('TextRank_data/seg' + str(video_seg_id))
    subdir = 'TextRank_data/seg' + str(video_seg_id) + '/'
    path_stage1 = subdir + "stage1.json"
    path_stage2 = subdir + "stage2_key_ph.json"
    path_stage3 = subdir + "stage3_imp_sent.json"

    """Perform statistical parsing/tagging on a document in JSON format"""
    parse_book_seg = pytextrank.parse_doc([book_segment])
    with open(path_stage1, 'w') as f:
        for graf in parse_book_seg:
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    graph, ranks = pytextrank.text_rank(path_stage1)
    """Collect and normalize the key phrases from a parsed document"""

    key_phrases = list(pytextrank.normalize_key_phrases(path_stage1, ranks))
    with open(path_stage2, 'w') as f:
        for rl in key_phrases:
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    kernel = pytextrank.rank_kernel(path_stage2)
    """Calculate a significance weight for each sentence, 
    using MinHash to approximate a Jaccard distance from key phrases determined by TextRank"""
    key_sentences = list(pytextrank.top_sentences(kernel, path_stage1))
    with open(path_stage3, 'w') as f:
        for s in key_sentences:
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")
    return key_sentences, key_phrases
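A hypothetical call, assuming `book_segment` is already a stage-0 dict as elsewhere on this page:

key_sentences, key_phrases = text_ranking(0, {"id": "seg0", "text": "..."})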
Example #17
def _get_keywords(path_stage0, path_stage2):
    # Stage 1: parse doc
    path_stage1 = 'o1.json'
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2: rank words
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    result_dict = dict()
    with open(path_stage2, 'w') as f2:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            _ro = rl._asdict()
            ro = dict()
            ro[_ro['text']] = _ro['rank']
            #f2.write("%s\n" % pytextrank.pretty_print(ro))

            result_dict[_ro['text']] = _ro['rank']

    return result_dict
Example #18
def retrieveSentences(content, word_limit):
    currpath = os.getcwd()
    folder = os.path.join(currpath, str(uuid.uuid4()))
    os.mkdir(folder)
    fname = str(uuid.uuid4())
    with open("{0}/{1}.json".format(folder, fname), "w") as f:
        f.write(json.dumps({"id": fname, "text": content}))
        f.close()
    path_stage0 = "{0}/{1}.json".format(folder, fname)
    path_stage1 = "{0}/o1.json".format(folder)
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
        f.close()
    path_stage2 = "{0}/o2.json".format(folder)
    graph, ranks = pytextrank.text_rank(path_stage1)
    #pytextrank.render_ranks(graph, ranks)
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
        f.close()
    kernel = pytextrank.rank_kernel(path_stage2)
    path_stage3 = "{0}/o3.json".format(folder)
    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")
        f.close()
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                  word_limit=word_limit),
                       key=lambda x: x[1])
    s = []
    for sent_text, idx in sent_iter:
        s.append(pytextrank.make_sentence(sent_text))
    graf_text = " ".join(s)
    shutil.rmtree(folder)
    return s
Example #19
def rank_bill(bill):
    bill_id = bill['bill_id']
    path_stage1 = prefix + '/{}_stage1'.format(bill_id)
    path_stage2 = prefix + '/{}_stage2'.format(bill_id)

    with open(path_stage1, 'w') as f:
        for graf in parse_doc([bill]):
            f.write(pretty_print(graf._asdict()))
            f.write('\n')

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    # open once, outside the loop; reopening with 'w' per phrase would
    # truncate the file and keep only the last phrase
    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(path_stage1, ranks):
            f.write(pretty_print(rl._asdict()))
            f.write('\n')

    kernel = rank_kernel(path_stage2)
    with open(prefix + '/{}_stage3'.format(bill_id), 'w') as f:
        for s in top_sentences(kernel, path_stage1):
            f.write(pretty_print(s._asdict()))
Example #20
path_stage1 = "o1.json"
path_stage2 = "o2.json"
path_stage3 = "o3.json"

# Extract keywords using pytextrank
with open(path_stage1, 'w') as f:
    for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
        f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
        #print(pytextrank.pretty_print(graf._asdict()))

graph, ranks = pytextrank.text_rank(path_stage1)
pytextrank.render_ranks(graph, ranks)

with open(path_stage2, 'w') as f:
    for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
        f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
        #print(pytextrank.pretty_print(rl))

kernel = pytextrank.rank_kernel(path_stage2)

with open(path_stage3, 'w') as f:
    for s in pytextrank.top_sentences(kernel, path_stage1):
        f.write(pytextrank.pretty_print(s._asdict()))
        f.write("\n")
        # to view output in this notebook
        print(pytextrank.pretty_print(s._asdict()))
Example #21
def insert_key_phrases_into_db(list_of_doc_dicts, doctype, collection):
    '''
    Takes a list of doc dictionaries and a doctype ('comment' or 'post'),
    processes each doc with PyTextRank, obtains key phrases, and
    inserts them into the corresponding document in MongoDB as a
    'key_phrases' field.
    '''
    path_stage0 = 'stage0.json'
    path_stage1 = 'stage1.json'
    path_stage2 = 'stage2.json'
    path_stage3 = 'stage3.json'

    total_docs = len(list_of_doc_dicts)

    failed_ids = []
    for i, doc_dict in enumerate(list_of_doc_dicts):
        if i % 50 == 0:
            print(f'processing {i} of {total_docs} documents')
        doc_dict['text'] = doc_dict['text'].split('\n_____\n\n')[0]

        try:
            with open(path_stage0, 'w') as f:
                json.dump(doc_dict, f)
            # Stage 1
            with open(path_stage1, 'w') as f:
                for graf in pytextrank.parse_doc(
                        pytextrank.json_iter(path_stage0)):
                    f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
                    # print(pytextrank.pretty_print(graf))
            # Stage 2
            graph, ranks = pytextrank.text_rank(path_stage1)
            pytextrank.render_ranks(graph, ranks)
            with open(path_stage2, 'w') as f:
                for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                    f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
                    # to view output in this notebook
                    # print(pytextrank.pretty_print(rl))
            # Stage 3
            kernel = pytextrank.rank_kernel(path_stage2)
            with open(path_stage3, 'w') as f:
                for s in pytextrank.top_sentences(kernel, path_stage1):
                    f.write(pytextrank.pretty_print(s._asdict()))
                    f.write("\n")
                    # to view output in this notebook
                    # print(pytextrank.pretty_print(s._asdict()))
            # Stage 4
            phrase_list = list(
                set([
                    p for p in pytextrank.limit_keyphrases(path_stage2,
                                                           phrase_limit=15)
                ]))
            phrases = ", ".join(phrase_list)

            sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                          word_limit=150),
                               key=lambda x: x[1])
            s = []

            for sent_text, idx in sent_iter:
                s.append(pytextrank.make_sentence(sent_text))

            graf_text = " ".join(s)
            collection.update_one({f'{doctype}_id': {
                '$eq': doc_dict['id']
            }}, {'$set': {
                'key_phrases': phrase_list
            }})
        except Exception:
            failed_ids.append(doc_dict['id'])
            print('failed on ', doc_dict['id'])
            continue
Example #22
def do_pytextrank(data):
    for item in data:
        for subItem in data[item]:
            print('###############')
            print('description:', subItem['description'])

            # using pytextrank
            # reference https://github.com/ceteri/pytextrank/issues/18

            # raw input
            subItemJSON = {'id': subItem['id'], 'text': subItem['description']}
            subItemJSON = json.dumps(subItemJSON)
            with open('sub_item.json', 'w') as outFile:
                outFile.write(subItemJSON)

            # stage 1
            with open('stage1_output.json', 'w') as outFile:
                for graf in pytextrank.parse_doc(
                        pytextrank.json_iter('sub_item.json')):
                    outFile.write("%s\n" %
                                  pytextrank.pretty_print(graf._asdict()))

            # stage 2
            graph, ranks = pytextrank.text_rank('stage1_output.json')
            pytextrank.render_ranks(graph, ranks)
            rlLists = []
            print('key phrases:')
            with open('stage2_output.json', 'w') as outFile:
                for rl in pytextrank.normalize_key_phrases(
                        'stage1_output.json', ranks):
                    # parse the JSON output rather than eval it
                    rlList = json.loads(pytextrank.pretty_print(rl))
                    rlLists.append(rlList)
                    print(rlList)

            # cleanup
            os.system(
                'rm -f sub_item.json stage1_output.json stage2_output.json graph.dot'
            )

            # input filter results based on pos
            # this is a heuristic
            filteredRlLists = [x for x in rlLists if 'nn' not in x[-2]]
            if (len(filteredRlLists) == 0):
                # invalid case
                continue
            else:
                [heuristic, iOItem] = do_heuristic(subItem, filteredRlLists)
                print('heuristic:', heuristic)
                print('i/o input:', iOItem)

            # input filter results based on pos
            # this is a heuristic
            filteredRlLists = [x for x in rlLists if 'nn' in x[-2]]
            if (len(filteredRlLists) == 0):
                # invalid case
                continue
            else:
                [heuristic, iOItem] = do_heuristic(subItem, filteredRlLists)
                print('heuristic:', heuristic)
                print('i/o output:', iOItem)

            print('###############')
Example #23
# materialize the grafs so they can be reused below; parse_doc returns a
# generator, which a single pass would exhaust
grafs = [graf._asdict() for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0))]
with open(path_stage1, 'w') as f:
    for graf in grafs:
        f.write("%s\n" % pytextrank.pretty_print(graf))
        # to view output in this notebook
        # print(pytextrank.pretty_print(graf))

# path_stage1 = path_dir + "o1.json"
path_stage2 = path_dir + "o2.json"

graph, ranks = pytextrank.text_rank(grafs)
pytextrank.render_ranks(graph, ranks)

with open(path_stage2, 'w') as f:
    for rl in pytextrank.normalize_key_phrases(grafs, ranks):
        f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
        # to view output in this notebook
        print(pytextrank.pretty_print(rl))

import networkx as nx
import pylab as plt

nx.draw(graph, with_labels=True)
plt.show()

path_stage1 = path_dir + "o1.json"
path_stage2 = path_dir + "o2.json"
path_stage3 = path_dir + "o3.json"

kernel = pytextrank.rank_kernel(path_stage2)
Example #24
def extract_phrasesfrom_textrank(corpus):
    record_data = pd.DataFrame({'sentences': corpus})
    record_data = pd.DataFrame({
        'id': record_data.index.tolist(),
        'text': record_data['sentences'].tolist()
    })
    tweet_items = []
    for jdict in record_data.to_dict(orient='records'):
        tweet_items.append(jdict)

    new_df_tweet = pd.DataFrame(columns=['text', 'keywords'])
    path_stage1 = "celebrity1_tweet.json"
    path_stage2 = "celebrity2_tweet.json"
    path_stage3 = "celebrity3_tweet.json"
    for item in tweet_items:
        items_new = [item]
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(items_new):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)

        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

        kernel = pytextrank.rank_kernel(path_stage2)

        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")
        phrases = ", ".join(
            set([
                p for p in pytextrank.limit_keyphrases(path_stage2,
                                                       phrase_limit=5)
            ]))
        sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                      word_limit=150),
                           key=lambda x: x[1])
        s = []

        for sent_text, idx in sent_iter:
            s.append(pytextrank.make_sentence(sent_text))

        graf_text = " ".join(s)
        # DataFrame.append was removed in pandas 2.0; build a one-row frame and concat
        new_df_tweet = pd.concat(
            [new_df_tweet,
             pd.DataFrame([{
                 'text': item.get('text'),
                 'keywords': phrases
             }])],
            ignore_index=True)

    celeb_list = [
        'Bradley Cooper', 'Chris Kyle', 'Clint Eastwood', 'bradley cooper',
        'bradley', 'cooper', 'chris kyle', 'chris', 'kyle', 'clint eastwood',
        'clint', 'eastwood'
    ]

    cleaned_df_tweet = pd.DataFrame(columns=['sentences', 'keywords'])
    for index, row in new_df_tweet.iterrows():
        if any(celeb in row['keywords'] for celeb in celeb_list):
            cleaned_df_tweet = pd.concat(
                [cleaned_df_tweet,
                 pd.DataFrame([{
                     'sentences': row['text'],
                     'keywords': row['keywords']
                 }])],
                ignore_index=True)

    cleaned_df_tweet.to_csv(phrase_filepath,
                            sep=',',
                            encoding='utf-8',
                            index=False)
    new_df_tweet.to_csv(all_phrasefile_path,
                        sep=',',
                        encoding='utf-8',
                        index=False)
    return new_df_tweet, cleaned_df_tweet
Example #25
def keyPhrases():
    graph, ranks = pytextrank.text_rank('temp2.json')
    pytextrank.render_ranks(graph, ranks)
    with open('temp3.json', 'w') as f:
        for rl in pytextrank.normalize_key_phrases('temp2.json', ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
Example #26
           'for components of a minimal set of solutions and algorithms of construction of ' + \
           'minimal generating sets of solutions for all types of systems are given. ' + \
           'These criteria and the corresponding algorithms for constructing a minimal ' + \
           'supporting set of solutions can be used in solving all the considered types ' + \
           'systems and systems of mixed types.'
someothertext = 'Amazon.com, Inc. is located in Seattle, WA and was founded July 5th, 1994 by Jeff Bezos, ' + \
    'allowing customers to buy everything from books to blenders. Seattle is north of Portland and ' + \
    'south of Vancouver, BC. Other notable Seattle - based companies are Starbucks and Boeing.'

docs = [{'text': sometext, 'id': 777}]

grafs = [{'graf': graf.graf} for graf in pytextrank.parse_doc(docs)]
graph, ranks = pytextrank.text_rank(grafs)
rank_list = [
    rl._asdict()
    for rl in pytextrank.normalize_key_phrases(grafs, ranks, skip_ner=False)
]
kernel = pytextrank.rank_kernel(rank_list)
sents = [s._asdict() for s in pytextrank.top_sentences(kernel, grafs)]
phrases = [
    p[0] for p in pytextrank.limit_keyphrases(rank_list, phrase_limit=6)
]

sent_iter = sorted(pytextrank.limit_sentences(sents, word_limit=150),
                   key=lambda x: x[1])
sents = [pytextrank.make_sentence(sent_text) for sent_text, idx in sent_iter]
graf_text = ' '.join(sents)

print("\n**excerpts:** %s\n\n**keywords:** %s" % (
    graf_text,
    phrases,
))
Example #27
            # pytextrank.render_ranks(graph, ranks)

            # stage 2: normalize key phrases

            stage_2_directory = os.path.join(directory, "Stage2Results")
            stage_2_files = find_files(stage_2_directory, "*.Stage2")
            stage_2_filename = "{publisher}_{version}_textRank.normalizedKeyPhrases.".format(
                version=version, publisher=publisher)
            stage_2_out = os.path.join(directory, "Stage2Results",
                                       "agglomerated", stage_2_filename)
            rl_fake_json = []  # stage 2 output
            if not os.path.isfile(stage_2_out):
                counter = 0
                for rl in pytextrank.normalize_key_phrases(
                        fake_json_graph_dicts,
                        ranks,
                        stopwords=RAKE.SmartStopList()):
                    # print(pytextrank.pretty_print(rl))
                    rl_fake_json.append([rl._asdict()])
                    stage_2_rl_filename = "___{publisher}_{version}_textRank_{thread_num}_rl.Stage2.".format(
                        version=version,
                        thread_num=counter,
                        publisher=publisher)
                    stage_2_rl_out = os.path.join(directory, "Stage2Results",
                                                  stage_2_rl_filename)
                    pickle.dump([rl._asdict()], open(stage_2_rl_out, 'wb'))
                    counter += 1
                # stage_2_args = [[fake_json_graph_dict, ranks, i]
                #                 for i, fake_json_graph_dict in
                #                 zip([k for k in range(0, len(fake_json_graph_dicts))],
                #                     fake_json_graph_dicts)]
Example #28
if os.path.exists(OUTOUT_DIRECTORY):
    shutil.rmtree(OUTOUT_DIRECTORY)
os.makedirs(OUTOUT_DIRECTORY)


print('Saving tweets to json...')
with open(TWEETS_JSON, 'w', encoding='utf8') as outfile:
    json.dump({'id': '777', 'text': '. '.join(user_tweets)}, outfile, ensure_ascii=False)
print('Saving tweets to json - Done')


print('Performing statistical parsing/tagging on tweets...')
with open(STATISTICAL_PARSING_OUTPUT, 'w') as f:
    for graf in pytextrank.parse_doc(pytextrank.json_iter(TWEETS_JSON)):
        f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
print('Performing statistical parsing/tagging on tweets... - Done')


print('Collecting and normalizing the key phrases from the parsed document...')
graph, ranks = pytextrank.text_rank(STATISTICAL_PARSING_OUTPUT)
pytextrank.render_ranks(graph, ranks)
with open(KEY_PHRASES_NORMALIZATION_OUTPUT, 'w') as f:
    for rl in pytextrank.normalize_key_phrases(STATISTICAL_PARSING_OUTPUT, ranks):
        f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
print('Collecting and normalizing the key phrases from the parsed document... - Done')


print("Summarizing tweets based on key phrases...")
phrases = ", ".join(set([p for p in pytextrank.limit_keyphrases(KEY_PHRASES_NORMALIZATION_OUTPUT, phrase_limit=MAX_SUBJECTS_TO_SHOW)]))
print("**Top-10 subjects:** %s" % phrases)
Example #29
#!/usr/bin/env python
# encoding: utf-8

from pytextrank import normalize_key_phrases, pretty_print, render_ranks, text_rank
import sys

## Stage 2:
##  * collect and normalize the key phrases from a parsed document
##
## INPUTS: <stage1>
## OUTPUT: JSON format `RankedLexeme(text, rank, ids, pos)`
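##  e.g. one output line (values hypothetical):
##    {"ids": [12], "pos": "np", "rank": 0.1652, "text": "mixed types"}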

if __name__ == "__main__":
    path_stage1 = sys.argv[1]

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    for rl in normalize_key_phrases(path_stage1, ranks):
        # print (rl)
        print(pretty_print(rl._asdict()))
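Invocation takes the stage-1 file as the single argument; the script name is hypothetical:

    python stage2.py o1.json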