Example #1
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import pairwise_distances_argmin_min

def main():
    """
    Executes the entire pipeline of the code
    :return: void
    """
    gt = getGroundTruth()
    model_sum, gt_sum = [], []
    print("Fetching encoder model...", end=" ")
    enc_model = SentenceTransformer('bert-base-nli-mean-tokens')
    print("Done")
    for full_text, catch_phrases in gt[:20]:
        # Embed each sentence
        sentence_embeddings = enc_model.encode(full_text)
        # Cluster each embedding
        cluster_n = 11
        clusters = cluster(sentence_embeddings, minimum_samples=cluster_n)
        # Record each cluster's mean sentence position in the document,
        # used below to order the summary sentences.
        centroids = []
        for idx in range(cluster_n):
            centroid_id = np.where(clusters.labels_ == idx)[0]
            centroids.append(np.mean(centroid_id))

        # Select the sentence closest to each cluster centroid
        closest, _ = pairwise_distances_argmin_min(clusters.cluster_centers_,
                                                   sentence_embeddings)
        ordering = sorted(range(cluster_n), key=lambda k: centroids[k])

        summary = '.'.join([full_text[closest[idx]]
                            for idx in ordering]).replace('\n', ' ')
        model_sum.append(summary)
        gt_sum.append(".".join(catch_phrases))
    print("ROUGE score: {}".format(evaluate(model_sum, gt_sum)))
Example #2
import re

def main():
	"""
	Executes the entire pipeline of the code
	:return: void
	"""
	gt = getGroundTruth()
	nb_dataset = []
	# Build the training set: paragraph-segmented text paired with the last
	# catch phrase, which serves as the document's class label.
	for full_text, catch_phrases in gt[:500]:
		# Strip the leading line number from each sentence.
		texts = [re.sub(r'^(\d+) (.*)', r'\2', text) for text in full_text]
		legal_text = " ".join(texts)
		legal_text = generateParagraphs(legal_text)
		nb_dataset.append((legal_text, catch_phrases[-1]))
	class_model, cv, legal_classes = nbTrain(nb_dataset)
	# Sanity-check the classifier on the first 20 documents (note that these
	# overlap the training slice above).
	for full_text, catch_phrases in gt[:20]:
		texts = [re.sub(r'^(\d+) (.*)', r'\2', text) for text in full_text]
		legal_text = " ".join(texts)
		legal_text = generateParagraphs(legal_text)
		legal_text = cleanText(legal_text)
		text_cv = cv.transform([legal_text])
		legal_class = class_model.predict(text_cv)
		gt_legal_class = catch_phrases[-1]
		print(legal_class[0], gt_legal_class, legal_class[0] == legal_classes[gt_legal_class])
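nbTrain() is likewise external; from its use above it must return a fitted classifier, the vectorizer behind the cv variable (the name suggests CountVectorizer), and a mapping from catch-phrase strings to the labels the classifier predicts. A minimal sketch assuming a CountVectorizer plus MultinomialNB pipeline, with all internals hypothetical:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def nbTrain(dataset):
    # dataset: list of (legal_text, catch_phrase) pairs, as built above.
    texts = [text for text, _ in dataset]
    phrases = [phrase for _, phrase in dataset]
    # Map each distinct catch phrase to an integer class label.
    legal_classes = {p: i for i, p in enumerate(sorted(set(phrases)))}
    labels = [legal_classes[p] for p in phrases]
    cv = CountVectorizer()
    features = cv.fit_transform(texts)
    model = MultinomialNB().fit(features, labels)
    return model, cv, legal_classes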
Example #3
def parseText():
	"""
	Prints the first three headings of each document
	:return: void
	"""
	gt = getGroundTruth()
	for full_text, catch_phrases in gt[:100]:
		paragraphs, headings = generateParagraph(full_text)
		for heading in headings[:3]:
			print(heading)
		print("="*20)
Example #4
    def train(self):
        """
		Trains a classifier for the legal text
		:return:
		"""
        gt = getGroundTruth()
        nb_dataset = []
        for full_text, catch_phrases in gt[:500]:
            legal_text = self.preprocess(full_text)
            nb_dataset.append((
                legal_text,
                catch_phrases[-1],
            ))
        self.nbTrain(nb_dataset)
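self.preprocess() is not shown; a plausible sketch, assuming it mirrors the inline pipeline from Example #2 (strip leading line numbers, join, paragraph-segment). generateParagraphs is the project helper seen there, and the body below is an assumption, not the author's code:

import re

def preprocess(self, full_text):
    # Strip the leading line number from each sentence (cf. Example #2).
    texts = [re.sub(r'^(\d+) (.*)', r'\2', text) for text in full_text]
    legal_text = " ".join(texts)
    return generateParagraphs(legal_text)  # project helper from Example #2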
Example #5
    def main(self):
        """
		Executes the entire pipeline of the code
		:return: void
		"""
        gt = getGroundTruth()
        model_sum, gt_sum = [], []
        doc_n = len(gt)
        for doc_idx in range(20):
            print("{}/{}".format(doc_idx, doc_n))
            full_text, catch_phrases = gt[doc_idx]
            summary = self.getSentenceSummary(full_text)
            model_sum.append(summary)
            gt_sum.append(".".join(catch_phrases))
        print("ROUGE score: {}".format(self.evaluate(model_sum, gt_sum)))
Example #6
    def getConclusion(self):
        """
		Returns the last catch phrase of every doc
		:return: void
		"""
        gt = getGroundTruth()
        conclusion_freq = {}
        for full_text, catch_phrases in gt[:500]:
            conclusion = catch_phrases[-1]
            if conclusion not in conclusion_freq:
                conclusion_freq[conclusion] = 0
            conclusion_freq[conclusion] += 1
        conclusions = list(conclusion_freq.items())
        conclusions.sort(key=lambda x: x[1], reverse=True)
        for conclusion, _ in conclusions:
            print(conclusion)
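The manual frequency dict above can be written with the standard library's collections.Counter; this behavior-equivalent rewrite of the loop body (assuming gt is the same ground-truth list) prints the same order:

from collections import Counter

conclusion_freq = Counter(catch_phrases[-1]
                          for full_text, catch_phrases in gt[:500])
for conclusion, _ in conclusion_freq.most_common():
    print(conclusion)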
Example #7
    def getIntroductions(self):
        """
		Returns the first catch phrase of every doc
		:return: void
		"""
        gt = getGroundTruth()
        intro_word_freq = {}
        for full_text, catch_phrases in gt[:500]:
            intro_words = catch_phrases[0].split(" ")
            for word in intro_words:
                if word not in self.stop_words:
                    if word not in intro_word_freq:
                        intro_word_freq[word] = 0
                    intro_word_freq[word] += 1
        intro_words = list(intro_word_freq.items())
        intro_words.sort(key=lambda x: x[1], reverse=True)
        print(intro_words)
Example #8
    def getHeadings(self):
        """
		Returns the headings of whole text
		:return: void
		"""
        gt = getGroundTruth()
        # A heading is a text line followed by one or more whitespace-only lines.
        pattern = re.compile(r'.+(\n )+\n.+')
        for full_text, catch_phrases in gt[:1]:
            print("".join(full_text))
            headings = []
            for sent in full_text:
                if pattern.search(sent) is not None:
                    sent = re.sub(r'(\n( )*)+\n', r'\n', sent)
                    headings.append(sent)
            print(len(headings))
            for heading in headings:
                print("============================")
                print(heading)
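A self-contained illustration of the heading pattern above: it matches sentences that contain whitespace-only lines between two text lines, and the re.sub collapses those near-blank lines (the sample string is invented for the demo):

import re

pattern = re.compile(r'.+(\n )+\n.+')
sent = "CATCHWORDS\n \n \nNegligence - duty of care"
print(pattern.search(sent) is not None)     # True: heading layout detected
print(re.sub(r'(\n( )*)+\n', r'\n', sent))  # blank-ish lines collapsed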
Example #9
def parseText():
    """
    Segments each document into thematic paragraph groups
    :return: list of per-document paragraph segments
    """
    gt = getGroundTruth()
    paraSegmentsFinal = []
    for full_text, catch_phrases in gt[:100]:
        paragraphs, headings, paragraphsUnderHeading = generateParagraph(
            full_text)
        # Group the paragraphs under their headings into thematic segments.
        paraSegments = thematicSegmentation(paragraphs, headings,
                                            paragraphsUnderHeading)
        paraSegmentsFinal.append(paraSegments)
    return paraSegmentsFinal
Example #10
import numpy as np
import skipthoughts
from sklearn.metrics import pairwise_distances_argmin_min

def main():
    """
    Executes the entire pipeline of the code
    :return: void
    """
    gt = getGroundTruth()
    model_sum, gt_sum = [], []
    # Skip-thought vectors replace the BERT sentence encoder from Example #1.
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    for full_text, catch_phrases in gt:
        # Embed each sentence
        encoded = encoder.encode(full_text)
        # Cluster each embedding
        cluster_n = 11
        clusters = cluster(encoded, minimum_samples=cluster_n)
        centroids = []
        for idx in range(cluster_n):
            centroid_id = np.where(clusters.labels_ == idx)[0]
            centroids.append(np.mean(centroid_id))

        # Select the sentence closest to each cluster centroid
        closest, _ = pairwise_distances_argmin_min(clusters.cluster_centers_,
                                                   encoded)
        ordering = sorted(range(cluster_n), key=lambda k: centroids[k])
        summary = ' '.join([full_text[closest[idx]]
                            for idx in ordering]).replace('\n', ' ')
        model_sum.append(summary)
        gt_sum.append(".".join(catch_phrases))
        # Debug output: inspect the chosen sentences and the reference summary,
        # then stop after the first document.
        print(ordering)
        print([(full_text[closest[idx]], closest[idx]) for idx in ordering])
        print(summary)
        print(len(catch_phrases))
        print(".".join(catch_phrases))
        break
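For reference, a self-contained demo of the selection step shared by Examples #1 and #10: scikit-learn's pairwise_distances_argmin_min returns, for each cluster center, the index of the closest embedding, i.e. the sentence that best represents that cluster (the toy vectors are invented):

import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min

embeddings = np.array([[0.0, 0.0], [1.0, 1.0], [5.0, 5.0]])
centers = np.array([[0.9, 1.1], [4.8, 5.2]])
closest, _ = pairwise_distances_argmin_min(centers, embeddings)
print(closest)  # [1 2]: sentences 1 and 2 are nearest the two centers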