def _get_cluster_summaries(self, comments_clusters):
        clusters_with_summaries = dict()
        for i in range(self.num_of_clusters):
            clusters_with_summaries[i] = {'post_ids': [],
                                          'summary': self.ERROR_NOT_ENOUGH_POSTS_FOR_SUMMARY,
                                          'keywords': [self.ERROR_NOT_ENOUGH_POSTS_FOR_TAGS]}

        for text_id, cluster_id in comments_clusters:
            clusters_with_summaries[cluster_id]['post_ids'].append(text_id)


        for cluster_id in clusters_with_summaries.keys():
            num_of_cluster_sentences = len(clusters_with_summaries[cluster_id]['post_ids'])
            if num_of_cluster_sentences < 10:
                continue

            # Sample 10 of this cluster's posts (not arbitrary corpus indices) and join them.
            sampled_ids = np.random.choice(clusters_with_summaries[cluster_id]['post_ids'], 10, replace=False)
            random_text_from_cluster = '. '.join(self.corpus.texts[text_id] for text_id in sampled_ids)

            clusters_with_summaries[cluster_id]['summary'] = summarize(random_text_from_cluster, word_count=150)


        cluster_summaries = []
        for i in range(self.num_of_clusters):
            cluster_summaries.append((i, clusters_with_summaries[i]['summary']))

        return cluster_summaries
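
# Note: the sampling above relies on np.random.choice(..., replace=False),
# which draws distinct elements without replacement. A quick standalone check
# (the post ids here are hypothetical):
import numpy as np

post_ids = list(range(25))
sampled = np.random.choice(post_ids, 10, replace=False)
assert len(set(sampled)) == 10  # all sampled ids are distinct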
    def test_text_summarization_on_short_input_text_is_empty_string(self):
        text = self._get_text_from_test_data("testsummarization_unrelated.txt")

        # Keeps the first 8 sentences to make the text shorter.
        text = "\n".join(text.split('\n')[:8])

        self.assertNotEqual(summarize(text), u"")
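
# The tests in this listing call self._get_text_from_test_data(...), which is
# not shown here; a minimal sketch, consistent with the inline fixture reads
# that appear further down in this file:
import os
from gensim import utils

def _get_text_from_test_data(self, file):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    with utils.smart_open(os.path.join(pre_path, file), mode="r") as f:
        return f.read()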
def generate_summary():
  global reference_summary_list, system_summary_list
  training = ET.parse('../data/training.xml')

  files_read = 0

  reference_summary_list = []
  system_summary_list = []
  for i, t in enumerate(training.findall("article")):
    article_id = t.get('id')
    reference_summary = t.find("summary").text
    text = t.find("text").text
    try:
      system_summary = summarize(text, word_count=20)
    except (ValueError, ZeroDivisionError):
      continue

    if system_summary is None or len(system_summary) == 0 or len(system_summary) > 140:
      continue

    # write reference summary to file
    ref_dir = os.pardir + "/test-summarization/reference/" + article_id + "_" + "reference.txt"
    sys_dir = os.pardir + "/test-summarization/system/" + article_id + "_" + "system.txt"
    write_to_file(ref_dir, reference_summary)
    reference_summary_list.append([ref_dir])

    # write system summary to file
    write_to_file(sys_dir, system_summary)
    system_summary_list.append(sys_dir)

    files_read += 1
    if files_read > 10:
      break
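
# generate_summary (and page_rank further down) assume a write_to_file helper
# that is not shown in this listing; a minimal sketch under that assumption:
def write_to_file(path, content):
    # Write the text to the given path, overwriting any existing file.
    with open(path, 'w') as f:
        f.write(content)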
Example #4
def add(request):
    logger.info('add metric=' + str(request.POST))
    text = request.POST.get('text', '')
    # Put each sentence on its own line so the summarizer can split them.
    text = '\n'.join(text.split('.'))
    try:
        summary = summarize(text)
    except Exception as e:
        summary = str(e)
        if isinstance(e, TypeError):
            # Fall back to the first line of the input.
            summary = ''.join(text.splitlines()[0:1])
    logger.info('summary=' + repr(summary))
    return HttpResponse(json.dumps({'status': 'success',
                                    'msg': 'added',
                                    'summary': summary}))
def gensim_topic():
    text = "Saturday was a lot of journeying to get to the Pangalane Lakes. We took 2 taxi-brousse and then a car met us at the junction to Manambato to take us to our hotel’s boat. The journey was very windy and sickening as we travelled through the mountains and towards the coast. We were met off the boat with a cocktail and some snacks, a great start to a relaxing few days. The hotel lay on a strip of white sand between two lakes. This meant you could watch both the sunrise and the sunset above the water. We stayed in little bungalows along the beach and felt like we’d hired a private island as there were no other guests around. Matt went for a first dip in one of the lakes and emerged unscathed, which was lucky considering we later found out that crocodiles had been spotted there. The rest of us went swimming in the other lake which was safe! Our versatile deckchairs helped with beach volleyball and football as well as reading in the sun. In the evenings we watched the stars and drank cocktails by the fire we made. Heavenly! There were two cheeky bamboo lemurs living on the strip that occasionally came around meal times to steal anything we’d left, and they weren’t afraid of humans in the slightest. After our chilled couple of days we hired a boat to take us to Tamatave along the lakes. We stopped for a beautiful picnic on the beach half way through the journey and also got to see one of the local villages when we had to stop to ask if I could use their loo! Lots of men from the villages were out fishing in their pirogues, made from hollowed out tree trunks."

    sentences = text.split(".")
    tokens = [sentence.split() for sentence in sentences]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

    print(summarization.summarize(text))
    def test_text_summarization(self):
        text = self._get_text_from_test_data("mihalcea_tarau.txt")

        # Makes a summary of the text.
        generated_summary = summarize(text)

        # To be compared to the method reference.
        summary = self._get_text_from_test_data("mihalcea_tarau.summ.txt")

        self.assertEqual(generated_summary, summary)
    def test_text_summarization_raises_exception_on_short_input_text(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
            text = f.read()

        # Keeps the first 8 sentences to make the text shorter.
        text = "\n".join(text.split('\n')[:8])

        self.assertTrue(summarize(text) is not None)
    def test_summary_from_unrelated_sentences(self):
        # Tests that the summarization of a text with unrelated sentences does not raise an exception.
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
            text = f.read()

        generated_summary = summarize(text)

        self.assertNotEqual(generated_summary, None)
def summa(fname):
	try:
		with open(fname, 'r') as myfile:
			data = myfile.read()
			sum1 = summarize(data)
	except Exception:
		print("")
		sum1 = ""
	return sum1
def summa(fname):
	with open(path + fname + ".txt", 'r') as myfile:
		data = myfile.read()
		sum1 = summarize(data)
	return sum1
    def test_text_summarization(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # Makes a summary of the text.
        generated_summary = summarize(text)

        # To be compared to the method reference.
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
            summary = f.read()

        self.assertEqual(generated_summary, summary)
Example #12
    def __iter__(self):
        with self.__db_object as conn:
            cursor = conn.cursor()
            cursor.execute(self.__dbSQL.sql)
            record = cursor.fetchone()
            while record:
                document_text = record[self.__dbSQL.doc_location]
                pk, title = record[self.__dbSQL.unique_key_location], record[self.__dbSQL.title_location]

                if self.preprocessor:
                    title = self.preprocessor.clean_text(title)
                    document_text = self.preprocessor.clean_text(document_text)

                self.record_identifiers.append((pk, title))
                yield summarization.summarize(document_text)
                record = cursor.fetchone()
def upload(request):
    name = request.POST.get('name', '')
    import urllib.parse
    name = urllib.parse.unquote(name)
    text = ""
    path = os.path.dirname(os.path.abspath(__file__)) + "/" + "../../../../ui/summarizer/public/uploads/"
    try:
        h = ''
        if name:
            print('opening file=' + path + name)
            h = textract.process(path + name)
        if h:
            text = '\n'.join(x.strip() for x in h.decode('utf-8').splitlines())
        else:
            with open(path + name, 'r') as fin:
                text = '\n'.join(fin.readlines())
        summary = ""
        if text:
            summary = summarize(text)
        if summary:
            print("summarized:" + name)
        else:
            # Fall back to the first line when the summarizer returns nothing.
            summary = ''.join(text.splitlines()[0:1])
    except Exception as e:
        summary = str(e)
        if isinstance(e, TypeError):
            summary = ''.join(text.splitlines()[0:1])
    return HttpResponse(json.dumps({'status': 'success',
                                    'msg': 'added',
                                    'summary': summary}))
Example #14
def articleGensim(file):

    # Open the article
    with open('../validArticles/' + file, 'r') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # Check if it has a top highlight
    highlight = soup.find('span', {'data-creator-ids': 'anon'})

    # Clean up highlight and remove bad unicode
    highlightClean = unidecode.unidecode(" ".join(item.strip() for item in highlight.find_all(text=True)))

    # Go to class containing article
    article = soup.find('main', {'class': 'postArticle-content js-postField js-notesSource'})

    # Strip out tags that add bad information (scripts and headings)
    for tag_name in ['noscript', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        for x in article.find_all(tag_name):
            x.extract()

    articleClean = unidecode.unidecode(" ".join(item.strip() for item in article.find_all(text=True)))

    # Summarize article using TextRank
    summary = summarize(articleClean, word_count=100)

    # Remove formatting so format anomalies don't affect results
    highlightNoFormat = re.sub("[^a-zA-Z]", "", highlightClean)
    summaryNoFormat = re.sub("[^a-zA-Z]", "", summary)

    if len(longest_common_substring(summaryNoFormat, highlightNoFormat)) > min(len(summaryNoFormat), len(highlightNoFormat)) - 10:
        return 1
    else:
        return 0
Example #15
#
# This recipe uses the automatic computer-science paper generation tool from mit.edu.
# You can generate your own paper by visiting https://pdos.csail.mit.edu/archive/scigen/
# and clicking "generate".
#
# This example needs a large amount of text to be available for summarization,
# so we use this paper-generation tool and extract the 'Introduction' section
# to run the summary analysis on.
#

urls = {
    'Daff: Unproven Unification of Suffix Trees and Redundancy':
    'http://scigen.csail.mit.edu/scicache/610/scimakelatex.21945.none.html',
    'CausticIslet: Exploration of Rasterization':
    'http://scigen.csail.mit.edu/scicache/790/scimakelatex.1499.none.html'
}

for key, url in urls.items():
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    data = soup.get_text()
    pos1 = data.find("1  Introduction") + len("1  Introduction")
    pos2 = data.find("2  Related Work")
    text = data[pos1:pos2].strip()
    print("PAPER URL: {}".format(url))
    print("TITLE: {}".format(key))
    print("GENERATED SUMMARY: {}".format(summarize(text)))
    print()
def helper(string):
    from gensim.summarization import summarize
    return summarize(string, split=True)
# Chap01/demo_gensim.py
from gensim.summarization import summarize
import sys

fname = sys.argv[1]

with open(fname, 'r') as f:
    content = f.read()
    summary = summarize(content, split=True, word_count=100)
    for i, sentence in enumerate(summary):
        print("%d) %s" % (i+1, sentence))
Example #18
 def test_summary_from_unrelated_sentences(self):
     # Tests that the summarization of a text with unrelated sentences is not empty string.
     text = self._get_text_from_test_data("testsummarization_unrelated.txt")
     generated_summary = summarize(text)
     self.assertNotEqual(generated_summary, u"")
Example #19
def my_form_post():
    original_text = request.form['text']
    summary = summarize(original_text, ratio=0.1, word_count=100)
    summary = summary.replace('\n', ' ')
    return render_template('my-form.html', original_text=original_text, summary_text=summary)
def text_summarization_gensim(text, summary_ratio=0.5):
    summary = summarize(text, split=True, ratio=summary_ratio)
    for sentence in summary:
        print(sentence)
Example #21
 def test_low_distinct_words_summarization_with_split_is_empty_list(self):
     text = self._get_text_from_test_data("testlowdistinctwords.txt")
     self.assertEqual(summarize(text, split=True), [])
Example #22
def analyze_text(text):
    ret = {}
    # language identification
    language = settings.LANG_ID.classify(text)[0]
    lang = settings.LANGUAGE_MODELS[language]
    doc = lang(text)
    ret['language'] = settings.LANGUAGE_MAPPING[language]
    # analyzed text containing lemmas, pos and dep. Entities are coloured
    analyzed_text = ''
    for token in doc:
        if token.ent_type_:
            analyzed_text += '<span class="tooltip" data-content="POS: {0}<br> LEMMA: {1}<br> DEP: {2}" style="color: red;" >{3} </span>'.format(
                token.pos_, token.lemma_, token.dep_, token.text)
        else:
            analyzed_text += '<span class="tooltip" data-content="POS: {0}<br> LEMMA: {1}<br> DEP: {2}" >{3} </span>'.format(
                token.pos_, token.lemma_, token.dep_, token.text)

    ret['text'] = analyzed_text

    # Text category. Only valid for Greek text for now
    if language == 'el':
        ret.update(sentiment_analysis(doc))
        try:
            ret['category'] = predict_category(text, language)
        except Exception:
            pass

    try:
        ret['summary'] = summarize(text)
    except ValueError:  # summarize raises ValueError when the text has too few sentences
        ret['summary'] = ''

    # top 10 most frequent keywords, based on tokens lemmatization
    frequency = defaultdict(int)
    lexical_attrs = {
        'urls': [],
        'emails': [],
        'nums': [],
    }
    for token in doc:
        if token.like_url:
            lexical_attrs['urls'].append(token.text)
        if token.like_email:
            lexical_attrs['emails'].append(token.text)
        if token.like_num or token.is_digit:
            lexical_attrs['nums'].append(token.text)
        if not token.is_stop and token.pos_ in [
                'VERB', 'ADJ', 'NOUN', 'ADV', 'AUX', 'PROPN'
        ]:
            frequency[token.lemma_] += 1
    keywords = [
        keyword for keyword, count in sorted(
            frequency.items(), key=lambda k_v: k_v[1], reverse=True)
    ][:10]
    ret['keywords'] = ', '.join(keywords)

    # Named Entities
    entities = {label: [] for key, label in ENTITIES_MAPPING.items()}
    for ent in doc.ents:
        # noticed that these are found some times
        if ent.text.strip() not in ['\n', '', ' ', '.', ',', '-', '–', '_']:
            mapped_entity = ENTITIES_MAPPING.get(ent.label_)
            if mapped_entity and ent.text not in entities[mapped_entity]:
                entities[mapped_entity].append(ent.text)
    ret['named_entities'] = entities

    # Sentences splitting
    ret['sentences'] = [sentence.text for sentence in doc.sents]

    # Lemmatized sentences splitting
    ret['lemmatized_sentences'] = [sentence.lemma_ for sentence in doc.sents]

    # Text tokenization
    ret['text_tokenized'] = [token.text for token in doc]

    # Parts of Speech
    part_of_speech = {label: [] for key, label in POS_MAPPING.items()}

    for token in doc:
        mapped_token = POS_MAPPING.get(token.pos_)
        if mapped_token and token.text not in part_of_speech[mapped_token]:
            part_of_speech[mapped_token].append(token.text)
    ret['part_of_speech'] = part_of_speech
    ret['lexical_attrs'] = lexical_attrs
    ret['noun_chunks'] = [
        re.sub(r'[^\w\s]', '', x.text) for x in doc.noun_chunks
    ]
    return ret
tree = util.find_files('/home/hobson/Downloads/evernote_notes/', ext='.txt')
print(len(tree))
tree[0]


# # `summarize`
# 
# <img src="summarization_tutorial_plot.png">

# In[53]:

for meta in tree:
    if meta['name'].lower().startswith('stuff'):
        continue
    print('')
    print('-'*80)
    print('### ' + os.path.splitext(meta['name'])[0])
    text = os.path.splitext(meta['name'])[0] + '. \n' + open(meta['path']).read()
    try:
        summary = summarize(text, word_count=50)
    except ValueError:
        print('TOO SHORT to SUMMARIZE')
        summary = text[:200]
    try:
        print(' '.join(pair[0] for pair in wv.most_similar(positive=[w for w in summary.split() if w in wv.vocab])[1:5] if '%' not in pair[0])) 
    except ValueError:
        print('MISMATCHED VOCAB')
    print(summary)
    

Example #24
 def test_low_distinct_words_summarization_is_empty_string(self):
     text = self._get_text_from_test_data("testlowdistinctwords.txt")
     self.assertEqual(summarize(text), u"")
Example #25
def serve_summary():
    text = request.get_json().get('text')
    try:
        summary = summarize(text)
    except ValueError:
        summary = text
    return jsonify({'res': summary})
Example #26
 def test_empty_text_summarization_with_split_is_empty_list(self):
     self.assertEquals(summarize("", split=True), [])
Example #27
 def test_empty_text_summarization_is_empty_string(self):
     self.assertEquals(summarize(""), u"")
 def summarize_text(self, text):
     return summarize(text, word_count=150)
def process():
    # Read the input text from the Entry widget, summarize, and show the result.
    sentence = E1.get()
    ans = summarize(sentence)
    E4.insert(0, ans)
    print(ans)
Example #30
 def test_empty_text_summarization_with_split_is_empty_list(self):
     self.assertEqual(summarize("", split=True), [])
Example #31
 def test_low_distinct_words_summarization_with_split_is_empty_list(self):
     text = self._get_text_from_test_data("testlowdistinctwords.txt")
     self.assertEqual(summarize(text, split=True), [])
 def test_empty_text_summarization_none(self):
     self.assertTrue(summarize("") is None)
Example #33
#"""
from gensim.summarization import summarize
text = "Thomas A. Anderson is a man living two lives. By day he is an " + \
    "average computer programmer and by night a hacker known as " + \
    "Neo. Neo has always questioned his reality, but the truth is " + \
    "far beyond his imagination. Neo finds himself targeted by the " + \
    "police when he is contacted by Morpheus, a legendary computer " + \
    "hacker branded a terrorist by the government. Morpheus awakens " + \
    "Neo to the real world, a ravaged wasteland where most of " + \
    "humanity have been captured by a race of machines that live " + \
    "off of the humans' body heat and electrochemical energy and " + \
    "who imprison their minds within an artificial reality known as " + \
    "the Matrix. As a rebel against the machines, Neo must return to " + \
    "the Matrix and confront the agents: super-powerful computer " + \
    "programs devoted to snuffing out Neo and the entire human " + \
    "rebellion. "

text = summarize(text)
print(text)
#"""
Example #34
 def test_summary_from_unrelated_sentences(self):
     # Tests that the summarization of a text with unrelated sentences is not empty string.
     text = self._get_text_from_test_data("testsummarization_unrelated.txt")
     generated_summary = summarize(text)
     self.assertNotEqual(generated_summary, u"")
def page_rank():
  global reference_summary_list, system_summary_list

  text = "Thomas A. Anderson is a man living two lives. By day he is an " + \
  "average computer programmer and by night a hacker known as " + \
  "Neo. Neo has always questioned his reality, but the truth is " + \
  "far beyond his imagination. Neo finds himself targeted by the " + \
  "police when he is contacted by Morpheus, a legendary computer " + \
  "hacker branded a terrorist by the government. Morpheus awakens " + \
  "Neo to the real world, a ravaged wasteland where most of " + \
  "humanity have been captured by a race of machines that live " + \
  "off of the humans' body heat and electrochemical energy and " + \
  "who imprison their minds within an artificial reality known as " + \
  "the Matrix. As a rebel against the machines, Neo must return to " + \
  "the Matrix and confront the agents: super-powerful computer " + \
  "programs devoted to snuffing out Neo and the entire human " + \
  "rebellion. "

  [dictionary, proc_text, sentences] = save_word_dict(text)

  raw_corpus = [dictionary.doc2bow(t) for t in proc_text]

  tfidf = models.TfidfModel(raw_corpus)
  
  # print tfidf
  corpus_tfidf = tfidf[raw_corpus]

  # print dictionary
  # load from   
  simMatrix = similarities.MatrixSimilarity(tfidf[raw_corpus])
  simMat = simMatrix[corpus_tfidf]

  s_len = len(proc_text)
  ranks = [1.0] * s_len
  old_ranks = [0.1] * s_len

  # Pairwise similarity weights, filled in from simMat inside the loop below.
  W = {}

  damping_factor = 0.5
  while not converged(ranks, old_ranks):
    old_ranks = list(ranks)  # copy, so the convergence check compares old vs. new values
    for i in range(s_len):
      summation = 0.0
      for j in range(s_len):
        if j == i:
          continue
        if j < i:
          W[(j,i)] = simMat[j][i]
        else:
          W[(j,i)] = simMat[i][j]
        prj = ranks[j]
        densum = 0.0
        for k in range(s_len):
          if k == j:
            continue
          if j < k:
            W[(j,k)] = simMat[j][k]
          else:
            W[(j,k)] = simMat[k][j]
          densum = densum + W[(j,k)]
        summation = summation + W[(j,i)] * (prj / densum)
      # Apply the damped update once all contributions for sentence i are summed.
      ranks[i] = (1 - damping_factor) + (damping_factor * summation)

  rankings = sorted(range(len(ranks)), key=lambda k: ranks[k])
  rankings.reverse()
  print(rankings)

  limit = 3
  system_summary = ""
  for i,index in enumerate(rankings):
    if i > limit:
      break
    system_summary = system_summary + " " + str(sentences[index])


  reference_summary = summarize(text, word_count=100)
  article_id = "test" # this should be filename id
  
  # write reference summary to file
  ref_dir = os.pardir + "/test-summarization/reference/" + article_id + "_" + "reference.txt"
  sys_dir = os.pardir + "/test-summarization/system/" + article_id + "_" + "system.txt"
  write_to_file(ref_dir, reference_summary)
  reference_summary_list.append([ref_dir])

  # write system summary to file
  write_to_file(sys_dir, system_summary)
  system_summary_list.append(sys_dir)
  test_print(reference_summary, system_summary)
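
# page_rank assumes two helpers that are not defined in this listing. Minimal
# sketches, assuming save_word_dict returns the gensim dictionary, the token
# lists and the raw sentences, and that converged() compares successive rank
# vectors against an absolute tolerance:
from gensim import corpora
from gensim.summarization.textcleaner import split_sentences

def save_word_dict(text):
    sentences = split_sentences(text)
    proc_text = [s.lower().split() for s in sentences]
    dictionary = corpora.Dictionary(proc_text)
    return [dictionary, proc_text, sentences]

def converged(ranks, old_ranks, tol=1e-4):
    return all(abs(r - o) <= tol for r, o in zip(ranks, old_ranks))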
Example #36
def res(jobfile):
    Resume_Vector = []
    Ordered_list_Resume = []
    Ordered_list_Resume_Score = []
    LIST_OF_FILES = []
    LIST_OF_FILES_PDF = []
    LIST_OF_FILES_DOC = []
    LIST_OF_FILES_DOCX = []
    Resumes = []
    Temp_pdf = ''
    os.chdir('./Resumes')
    for file in glob.glob('**/*.pdf', recursive=True):
        LIST_OF_FILES_PDF.append(file)
    for file in glob.glob('**/*.doc', recursive=True):
        LIST_OF_FILES_DOC.append(file)
    for file in glob.glob('**/*.docx', recursive=True):
        LIST_OF_FILES_DOCX.append(file)

    LIST_OF_FILES = LIST_OF_FILES_DOC + LIST_OF_FILES_DOCX + LIST_OF_FILES_PDF
    print("This is LIST OF FILES")
    print(LIST_OF_FILES)

    print("####### PARSING ########")
    for nooo,i in enumerate(LIST_OF_FILES):
        Ordered_list_Resume.append(i)
        Temp = i.split(".")
        if Temp[1] == "pdf" or Temp[1] == "Pdf" or Temp[1] == "PDF":
            try:
                print("This is PDF" , nooo)
                with open(i,'rb') as pdf_file:
                    read_pdf = PyPDF2.PdfFileReader(pdf_file)
              
                    number_of_pages = read_pdf.getNumPages()
                    for page_number in range(number_of_pages): 

                        page = read_pdf.getPage(page_number)
                        page_content = page.extractText()
                        page_content = page_content.replace('\n', ' ')
                        Temp_pdf = str(Temp_pdf) + str(page_content)
                    Resumes.extend([Temp_pdf])
                    Temp_pdf = ''
                  
            except Exception as e: print(e)
        if Temp[1] == "doc" or Temp[1] == "Doc" or Temp[1] == "DOC":
            print("This is DOC" , i)
                
            try:
                a = textract.process(i)
                a = a.replace(b'\n',  b' ')
                a = a.replace(b'\r',  b' ')
                b = str(a)
                c = [b]
                Resumes.extend(c)
            except Exception as e: print(e)
                
                
        if Temp[1] == "docx" or Temp[1] == "Docx" or Temp[1] == "DOCX":
            print("This is DOCX" , i)
            try:
                a = textract.process(i)
                a = a.replace(b'\n',  b' ')
                a = a.replace(b'\r',  b' ')
                b = str(a)
                c = [b]
                Resumes.extend(c)
            except Exception as e: print(e)
                    
                
        if Temp[1] == "ex" or Temp[1] == "Exe" or Temp[1] == "EXE":
            print("This is EXE" , i)
            pass



    print("Done Parsing.")



    Job_Desc = 0
    LIST_OF_TXT_FILES = []
    os.chdir('../Job_Description')
    f = open(jobfile , 'r')
    text = f.read()
        
    try:
        tttt = str(text)
        tttt = summarize(tttt, word_count=100)
        text = [tttt]
    except Exception:
        text = ['None']

    f.close()

    vectorizer = TfidfVectorizer(stop_words='english')
    vectorizer.fit(text)
    vector = vectorizer.transform(text)

    Job_Desc = vector.toarray()
   

    os.chdir('../')
    for i in Resumes:

        text = i
        tttt = str(text)
        try:
            tttt = summarize(tttt, word_count=100) 
            text = [tttt]
            vector = vectorizer.transform(text)

            aaa = vector.toarray()
            Resume_Vector.append(vector.toarray())
        except:
            pass

    for i in Resume_Vector:

        samples = i
        neigh = NearestNeighbors(n_neighbors=1)
        neigh.fit(samples) 

        Ordered_list_Resume_Score.extend(neigh.kneighbors(Job_Desc)[0][0].tolist())

    Z = [x for _,x in sorted(zip(Ordered_list_Resume_Score,Ordered_list_Resume))]
    print(Ordered_list_Resume)
    print(Ordered_list_Resume_Score)
    flask_return = []
   

    for n,i in enumerate(Z):
   
        name = getfilepath(i)
        rank = n+1
        res = ResultElement(rank, name)
        flask_return.append(res)
        print(f"Rank{res.rank+1} :\t {res.filename}")
    return flask_return
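
# res() assumes a ResultElement wrapper and a getfilepath helper; hypothetical
# minimal stand-ins:
import os

class ResultElement:
    def __init__(self, rank, filename):
        self.rank = rank
        self.filename = filename

def getfilepath(path):
    # Assumed to reduce a glob-matched path to a displayable file name.
    return os.path.basename(path)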
Example #37
 def test_empty_text_summarization_is_empty_string(self):
     self.assertEqual(summarize(""), u"")
def get_summary(text):
    print("---")
    print("Summarizing text...")
    # word_count=150 seems to be the minimum...
    summary = summarize(text, word_count=150)
    return summary
Example #39
 def test_low_distinct_words_summarization_is_empty_string(self):
     text = self._get_text_from_test_data("testlowdistinctwords.txt")
     self.assertEqual(summarize(text), u"")
Example #40
def main():

    image = Image.open('images/wordcloud.png')

    st.sidebar.image(image, width=200)
    st.sidebar.header("NLP demos")
    st.sidebar.text("Select an option and see it in action!")

    st.title("Natural Language Processing demos")
    st.markdown("""
        #### An NLP app for demonstration purposes: analyze your text!
        

        """)

    # Named Entity Recognition

    if st.sidebar.checkbox("Named Entity Recognition", key='check1'):

        lang_options = st.selectbox("Choose language (EN/PT)", ['EN', 'PT'],
                                    key='sel1')

        if lang_options == 'EN':
            lang_model = 'en_core_web_sm'
        else:
            lang_model = 'pt_core_news_sm'

        message = st.text_area("Enter text inside the box...", key='ins1')

        if st.button("Run", key='run1'):
            with st.spinner('Wait for it...'):
                entity_result = entity_analyzer(message, lang_model)
            st.success(st.json(entity_result))

    # Summarization

    if st.sidebar.checkbox("Text Summarization", key='check2'):
        st.subheader("Summarize Your Text")

        message = st.text_area(
            "Enter text (EN only for now) inside the box...", key='ins2')

        ratio_value = st.slider(
            'Select a ratio (%) that determines the proportion of the number of sentences of the original text to be chosen for the summary',
            0, 100, (10))

        if st.button("Run", key='run2'):
            with st.spinner('Wait for it...'):
                summary_result = summarize(message, ratio=ratio_value / 100)
            st.success(summary_result)



# Automated Keyword Extraction

    if st.sidebar.checkbox("Automated Keyword Extraction", key='check3'):
        st.subheader("Extract Keywords")

        lang_options = st.selectbox("Choose language (EN/PT)", ['EN', 'PT'],
                                    key='sel2')

        if lang_options == 'EN':
            stop_words = en_stopwords
            lang_model = 'en_core_web_sm'
        else:
            lang_model = 'pt_core_news_sm'
            stop_words = pt_stopwords

        # nlp = spacy.load(lang_model)

        message = st.text_area("Enter text inside the box...", key='ins3')

        if st.button("Run", key='run3'):
            with st.spinner('Wait for it...'):

                # corpus = []

                text = unidecode.unidecode(message)

                corpus = clean_string(text, lang_options)

                tr4w = TextRank4Keyword()
                tr4w.set_stopwords(stopwords=stop_words, lang_model=lang_model)
                # tr4w.set_stopwords(stopwords=stop_words)
                # tr4w.analyze(ppp, candidate_pos = ['NOUN', 'PROPN', 'VERB'], window_size=4, lower=False)
                tr4w.analyze(corpus,
                             window_size=4,
                             lower=False,
                             lang_model=lang_model)

                st.success('Keywords: ' +
                           (', '.join(sorted(tr4w.get_keywords(10)))))

    # Data Anonymization (erasing names)

    if st.sidebar.checkbox("Anonymize Personal Data"):
        st.subheader("Anonymize Your Data: Hiding Names")

        lang_options = st.selectbox("Choose language (EN/PT)", ['EN', 'PT'],
                                    key='sel3')

        if lang_options == 'EN':
            lang_model = 'en_core_web_sm'
        elif lang_options == 'PT':
            lang_model = 'pt_core_news_sm'
        else:
            lang_model = 'en_core_web_sm'

        message = st.text_area("Enter text inside the box...", key='ins4')

        if st.button("Run", key='run4'):
            with st.spinner('Wait for it...'):
                names_cleaned_result = sanitize_names(message, lang_model)
                st.success(names_cleaned_result)

    # N-grams

    if st.sidebar.checkbox("N-Grams Barplot"):
        st.subheader("Visualize an N-grams barplot")

        lang_option = st.selectbox("Choose language (EN/PT)", ['EN', 'PT'],
                                   key='sel4')


        ngram_option = st.selectbox("Choose N for N-grams (1, 2 or 3)",
                                    [1, 2, 3],
                                    key='sel5')


        message = st.text_area("Let's analyze and get some visuals...",
                               key='ins5')

        if st.button("Run", key='run5'):
            with st.spinner('Wait for it...'):
                corpus = []

                text = unidecode.unidecode(message)

                corpus.append(clean_string(text, lang_option))

                top3_words = get_top_n_words(corpus, ngram_option, n=20)
                top3_df = pd.DataFrame(top3_words)
                top3_df.columns = ["N-gram", "Freq"]
                fig = px.bar(top3_df, x='N-gram', y='Freq')

                st.plotly_chart(fig)

    # Wordcloud

    if st.sidebar.checkbox("Wordcloud"):
        st.subheader("Visualize a wordcloud")

        lang_option = st.selectbox("Choose language (EN/PT)", ['EN', 'PT'],
                                   key='sel6')

        if lang_option == 'EN':
            # lang_model = 'en_core_web_sm'
            stop_words = en_stopwords
        else:
            # lang_model = 'pt_core_news_sm'
            stop_words = pt_stopwords

        message = st.text_area("Let's analyze and get some visuals...",
                               key='ins6')

        if st.button("Run", key='run6'):
            with st.spinner('Wait for it...'):
                corpus = []

                text = unidecode.unidecode(message)

                corpus.append(clean_string(text, lang_option))

                #Word cloud
                wordcloud = WordCloud(background_color='white',
                                      stopwords=stop_words,
                                      max_words=100,
                                      max_font_size=50,
                                      random_state=42).generate(str(corpus))
                fig = plt.figure(1)
                plt.imshow(wordcloud, interpolation="bilinear")
                plt.axis('off')
                st.pyplot(fig)
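
# The N-grams demo above calls get_top_n_words, which is not defined in this
# snippet. A plausible sketch using scikit-learn's CountVectorizer (an
# assumption, not necessarily the author's implementation):
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_words(corpus, ngrams=1, n=20):
    # Count n-grams across the corpus and return the n most frequent
    # as (ngram, frequency) pairs.
    vec = CountVectorizer(ngram_range=(ngrams, ngrams)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, int(sum_words[0, idx]))
                  for word, idx in vec.vocabulary_.items()]
    return sorted(words_freq, key=lambda x: x[1], reverse=True)[:n]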
Example #41
def get_content(url):
    prCyan('BS4 Pull Request...')
    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    })
    page = ''
    while page == '':
        try:
            # Pass the spoofed User-Agent headers built above with the request.
            page = requests.get(url, headers=headers)
            break
        except requests.exceptions.RequestException:
            print("Connection refused by the server..")
            print("Let me sleep for 5 seconds")
            print("ZZzzzz...")
            time.sleep(5)
            print("Was a nice sleep, now let me continue...")
            continue
    raw_html = page.content
    soup = BeautifulSoup(raw_html, 'html.parser')
    results = pullContent(soup)
    prGreen('BS4 Original Content:')
    print(results)
    headers = soup.select("h1")
    header = ""
    if len(headers) != 0:
        header = headers[0].text
    if (("Forbidden" in header) or (header == "")
            or ("Access Denied" in header) or ("400 Bad Request" in header)
            or ("Error" in header)):
        header = "Title Could not be Retrieved due to Webpage Restrictions"
    prCyan('Title:')
    print(header)
    final_text_summary = []

    # Check if content can be pulled with BS4
    """word count minimum"""
    validThreshold = 300
    if len(results.split(" ")) < validThreshold:
        # Selenium Pull
        sel_driver.implicitly_wait(1)  # reduce error
        sel_driver.get(url)
        soup = BeautifulSoup(sel_driver.page_source, "html.parser")
        results = pullContent(soup)
        prCyan('Selenium Original Content:')
        print(results)
    """Output with summariser"""
    # apply final regex clean up before summarising
    results = re.sub(r"\{(.*?)\}+", '',
                     results)  #removes anything enclosing {}
    results = re.sub(r"(#[A-Za-z]+)", '', results)  #removes hashtags
    results = re.sub(r"(^.+@[^\.].*\.[a-z]{2,}$)", '', results)  #removes email
    prCyan('After Regex...')
    print(results)
    final_results = summarize(results)
    prCyan('With text summary:')
    print(final_results)
    final_text_summary.append(header)
    final_text_summary.append(final_results)

    return final_text_summary
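
# get_content assumes coloured-print helpers and a pullContent extractor that
# are not shown in this listing. Hypothetical minimal versions:
def prCyan(s):
    print("\033[96m{}\033[00m".format(s))

def prGreen(s):
    print("\033[92m{}\033[00m".format(s))

def pullContent(soup):
    # Crude content pull: join the text of every paragraph tag.
    return ' '.join(p.get_text().strip() for p in soup.find_all('p'))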
import requests
from bs4 import BeautifulSoup
from gensim.summarization import summarize

url = 'https://www.gp.se/nyheter/sverige/sverige-%C3%A4ndrar-inte-strategi-trots-who-besked-1.26396387'
page = requests.get(url).text

soup = BeautifulSoup(page, 'html.parser')

headline = soup.find('h1').get_text()
print(headline)

p_tags = soup.find_all('p')

p_tags_text = [tag.get_text().strip() for tag in p_tags]
p_tags_text

sentence_list = [sentence for sentence in p_tags_text if '\n' not in sentence]
sentence_list = [sentence for sentence in sentence_list if '.' in sentence]
sentence_list

article = ' '.join(sentence_list)

summary = summarize(article, ratio=0.3)

print(f'Length of original article: {len(article)}')
print(f'Length of summary: {len(summary)} \n')
print(f'Headline: {headline} \n')
print(f'Article Summary:\n{summary}')
print("Press enter to exit")
input()
Example #43
def abstract(txt_name):
    with open(txt_name, 'r') as myfile:
        article = myfile.read().replace('\n', ' ')
    return summarize(article, ratio=0.2)
Example #44
def upload_file():
    if request.method == 'POST':
        req = request.form
        # check if the post request has the file part
        if req['form-name'] == 'form1':
            text_area = req['txtarea']
            percent = int(req['percent'])
            percent = percent / 100
            print(percent)
            sum_text = summarize(text_area, ratio=percent)
            return render_template('upload_file.html',
                                   in_text=text_area,
                                   sum_text=sum_text)
        if 'file' not in request.files:
            print('no file')
            return redirect(request.url)
        file = request.files['file']
        # if the user does not select a file, the browser may
        # submit an empty part without a filename
        if file.filename == '':
            print('no filename')
            return redirect(request.url)
        else:
            filename = secure_filename(file.filename)

            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))

            text = " ".join((line for line in open(os.path.join(
                app.config['UPLOAD_FOLDER'], filename),
                                                   encoding='utf-8')))
            # os.remove(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            percent = int(req['percent'])
            percent = percent / 100
            print(text)
            text2 = summarize(text, ratio=percent)
            print("after summary.....")

            # file_path = app.config['UPLOAD_FOLDER2']
            print(filename)
            # filename= "SUM_" + filename
            # files = open(file_path+ sum_file, 'w')
            # files.write(text2)
            # files.close()
            with open(os.path.join(app.config['UPLOAD_FOLDER2'], filename),
                      'w') as f:
                f.write(text2)
            print("here is the filename : ", filename)

            print("saved file successfully")
            #send file name as parameter to downlad
            # file_path = os.path.join(app.config['UPLOAD_FOLDER2'], filename)
            # print("file for sent is ::: ")
            # return send_file(file_path, as_attachment=True, attachment_filename='')
            # file_path = os.path.join(app.config['UPLOAD_FOLDER2'], filename)

            return redirect('/downloadfile/' + filename)
    return render_template('upload_file.html',
                           in_text="Text area for input ...")
Example #45
 def set_summary2(self, text):
     summary = summarize(text, ratio=0.5)
     self.summary = summary
summary_length = int(math.ceil(len(sent_scores) / 5))
sent_scores.sort(key=lambda sent: sent[0])
text = frame["Content"][article_id]
summary = ""
for summary_sentence in sent_scores[:summary_length]:
    summary = summary + summary_sentence[1]
print(text)

print("summary content------")

print(summary)
## Gensim summarization using TextRank (graph-based) ##
from gensim.summarization import summarize
from gensim.summarization import keywords

reference = summarize(text)
print(reference)

## evaluation

from pythonrouge.pythonrouge import Pythonrouge

# pythonrouge expects sentence lists: summary -> [[sent, ...]],
# reference -> [[[sent, ...]]] (multiple references per system summary).
rouge = Pythonrouge(summary_file_exist=False,
                    summary=[[summary]],
                    reference=[[[reference]]],
                    n_gram=2,
                    ROUGE_SU4=True,
                    ROUGE_L=False,
                    recall_only=True,
                    stemming=True,
                    stopwords=True)
score = rouge.calc_score()
print(score)
from gensim.summarization import summarize
from gensim.summarization import keywords
import pandas as pd 


if __name__ == "__main__":
	df = pd.read_csv('newtestament.txt', delimiter="|", skiprows=0, names=['Book', 'Chapter', 'Verse', 'Original Text'])
	text = df[df['Book'] == 'Mat']['Original Text'].values

	alltext = ''
	for verse in text:
	    alltext += verse + ' '

	print(summarize(alltext, ratio=0.005))
	print(keywords(alltext, ratio=0.01))

Example #48
# -*- coding: utf-8 -*-

text = "August 21, 2020 Mr. Abner Kenny Northern Insurance P.O. Box 337 Milwaukee, WI Date of incident: July 12, 2020 Dear Mr. Kenny As you know, I was involved in a collision with a van owned by your insured on Chestnut St. in Waukesha, WI. I was waiting at a stop sign, when the Jenkins Hardware van rear-ended me. I was not injured, but my car suffered a fair amount of damage, which, despite repeated phone calls, Northern Insurance has so far refused to pay for. The Jenkins driver was obviously negligent. He rear-ended someone waiting at a stop sign. It is an open-and-shut case. As a result of this incident, my trunk was caved in. I have a small Honda, and small cars don’t tend to fare very well when they are hit by commercial vans. I brought it to my usual mechanic, who recommended that I go to Waukesha Body Shop, where they gave me an estimate for $4,600 for a full repair. I have attached another copy of the estimate, although I have sent it to you twice before. You also have pictures of the damage. My car is only 2 years old, and is worth far more than that. I understand that your estimator valued the repair costs at $4,000. That is not that far off. I don’t understand why we haven’t been able to agree on a repair price. Taking into account your insured’s absolute liability and my damages in this case, I demand $4,600.00 to settle this case. This is not a complex claim. If I do not hear from you in one week, I will call the Wisconsin Department of Insurance to file a complaint against you. Very truly yours, Fred Smith"
print(text)
print("------------------------------------------")
from gensim.summarization import summarize
from gensim.summarization import keywords
# summarizes the text in 90 words, can just change this word_count
text_summary = summarize(text, word_count=90)
print(text_summary)
# Keywords
text_keywords = keywords(text)
print(text_keywords)
 def test_empty_text_summarization_none(self):
     self.assertTrue(summarize("") is None)
Example #50
 def summarize_text(text, split=True):
     return summarize(text, split=split)
Example #51
def res(jobfile):
    Resume_Vector = []
    Ordered_list_Resume = []
    Ordered_list_Resume_Score = []
    LIST_OF_FILES = []
    LIST_OF_FILES_PDF = []
    LIST_OF_FILES_DOC = []
    LIST_OF_FILES_DOCX = []
    Resumes = []
    Temp_pdf = ''
    os.chdir('./Original_Resumes')
    for file in glob.glob('**/*.pdf', recursive=True):
        LIST_OF_FILES_PDF.append(file)
    for file in glob.glob('**/*.doc', recursive=True):
        LIST_OF_FILES_DOC.append(file)
    for file in glob.glob('**/*.docx', recursive=True):
        LIST_OF_FILES_DOCX.append(file)

    LIST_OF_FILES = LIST_OF_FILES_DOC + LIST_OF_FILES_DOCX + LIST_OF_FILES_PDF
    # LIST_OF_FILES.remove("antiword.exe")
    print("This is LIST OF FILES")
    print(LIST_OF_FILES)

    # print("Total Files to Parse\t" , len(LIST_OF_PDF_FILES))
    print("####### PARSING ########")
    for nooo, i in enumerate(LIST_OF_FILES):
        Ordered_list_Resume.append(i)
        Temp = i.split(".")
        if Temp[1] == "pdf" or Temp[1] == "Pdf" or Temp[1] == "PDF":
            try:
                print("This is PDF", nooo)
                with open(i, 'rb') as pdf_file:
                    read_pdf = PyPDF2.PdfFileReader(pdf_file)
                    # page = read_pdf.getPage(0)
                    # page_content = page.extractText()
                    # Resumes.append(Temp_pdf)

                    number_of_pages = read_pdf.getNumPages()
                    for page_number in range(number_of_pages):

                        page = read_pdf.getPage(page_number)
                        page_content = page.extractText()
                        page_content = page_content.replace('\n', ' ')
                        # page_content.replace("\r", "")
                        Temp_pdf = str(Temp_pdf) + str(page_content)
                        # Temp_pdf.append(page_content)
                        # print(Temp_pdf)
                    Resumes.extend([Temp_pdf])
                    Temp_pdf = ''
                    # f = open(str(i)+str("+") , 'w')
                    # f.write(page_content)
                    # f.close()
            except Exception as e:
                print(e)
        if Temp[1] == "doc" or Temp[1] == "Doc" or Temp[1] == "DOC":
            print("This is DOC", i)

            try:
                a = textract.process(i)
                a = a.replace(b'\n', b' ')
                a = a.replace(b'\r', b' ')
                b = str(a)
                c = [b]
                Resumes.extend(c)
            except Exception as e:
                print(e)

        if Temp[1] == "docx" or Temp[1] == "Docx" or Temp[1] == "DOCX":
            print("This is DOCX", i)
            try:
                a = textract.process(i)
                a = a.replace(b'\n', b' ')
                a = a.replace(b'\r', b' ')
                b = str(a)
                c = [b]
                Resumes.extend(c)
            except Exception as e:
                print(e)

        if Temp[1] == "ex" or Temp[1] == "Exe" or Temp[1] == "EXE":
            print("This is EXE", i)
            pass

    print("Done Parsing.")

    Job_Desc = 0
    LIST_OF_TXT_FILES = []
    os.chdir('../Job_Description')
    f = open(jobfile, 'r', encoding="utf8", errors="ignore")
    text = f.read()

    try:
        tttt = str(text)
        tttt = summarize(tttt, word_count=100)
        text = [tttt]
    except Exception:
        text = ['None']

    f.close()

    vectorizer = TfidfVectorizer(stop_words='english')
    # print(text)
    vectorizer.fit(text)
    vector = vectorizer.transform(text)

    Job_Desc = vector.toarray()
    #print("\n\n")
    #print("This is job desc : " , Job_Desc)

    name_string = []
    linked_in = []
    os.chdir('../')
    #print("printing listdir:    ")
    #os.listdir('../')
    k = 0
    for i in Resumes:
        print("hello000000000000000000")
        name_s = ""
        links = str(k) + "none"
        k += 1
        text = i
        tttt = str(text)
        if (len(tttt.strip()) > 2):
            name_s = tttt.split()[0] + " " + tttt.split()[1]
            if ("name" in tttt):
                bt = tttt[(tttt.index("name") + 5):]
                name_s = bt.split()[0] + " " + bt.split()[1]
            if ("nam e" in tttt):
                bt = tttt[(tttt.index("nam e") + 5):]
                name_s = bt.split()[0] + " " + bt.split()[1]
            if ("Name" in tttt):
                bt = tttt[(tttt.index("Name") + 5):]
                name_s = bt.split()[0] + " " + bt.split()[1]
            if ("linkedin.com" in tttt):
                indexx = tttt.index("linkedin.com")
                y1 = tttt[:indexx:]
                y1 = y1[::-1]
                y2 = tttt[indexx::]
                starter = ""
                for i in y1:
                    if (i == ' '):
                        break
                    else:
                        starter = starter + i
                starter = starter[::-1]
                ender = ""
                for i in y2:
                    if (i == ' '):
                        break
                    else:
                        ender = ender + i
                links = starter + ender
            if ("gmail.com" in tttt):
                print("yeahhhhhhhhhhhhhhhhhhhhhhhhhhhhhh")
                indexx = tttt.index("gmail.com")
                y1 = tttt[:indexx:]
                y1 = y1[::-1]
                y2 = tttt[indexx::]
                starter = ""
                for i in y1:
                    if (i == ' '):
                        break
                    else:
                        starter = starter + i
                starter = starter[::-1]
                ender = ""
                for i in y2:
                    if (i == ' '):
                        break
                    else:
                        ender = ender + i
                links = links + '\n' + starter + ender
            if ("github.com" in tttt):
                indexx = tttt.index("github.com")
                y1 = tttt[:indexx:]
                y1 = y1[::-1]
                y2 = tttt[indexx::]
                starter = ""
                for i in y1:
                    if (i == ' '):
                        break
                    else:
                        starter = starter + i
                starter = starter[::-1]
                ender = ""
                for i in y2:
                    if (i == ' '):
                        break
                    else:
                        ender = ender + i
                links = links + '\n' + starter + ender
        linked_in.append(links)
        name_string.append(name_s)
        try:
            tttt = summarize(tttt, word_count=100)
            text = [tttt]
            vector = vectorizer.transform(text)

            aaa = vector.toarray()
            Resume_Vector.append(vector.toarray())
        except:
            pass
    # print(Resume_Vector)
    print("heloooooooo")
    for i in Resume_Vector:
        samples = i
        neigh = NearestNeighbors(n_neighbors=1)
        neigh.fit(samples)

        Ordered_list_Resume_Score.extend(
            neigh.kneighbors(Job_Desc)[0][0].tolist())

    Z = [
        x
        for _, x in sorted(zip(Ordered_list_Resume_Score, Ordered_list_Resume))
    ]
    sorted_names = [
        x for _, x in sorted(zip(Ordered_list_Resume_Score, name_string))
    ]
    linked_in_res = [
        x for _, x in sorted(zip(Ordered_list_Resume_Score, linked_in))
    ]
    print(Ordered_list_Resume)
    print(linked_in)
    print(Ordered_list_Resume_Score)
    print(Z)
    print(linked_in_res)
    return_flask_name = []
    flask_return = []
    for x in Z:
        if x == "Resume_Arunima_Shukla.pdf":
            y1 = Z.index("Resume_Arunima_Shukla.pdf")
            linked_in_res[
                y1] = "https://www.linkedin.com/in/arunima-shukla-pg/ \n https://github.com/arunima811"
        if x == "Updated_Ishank_Vasania_CSE_Resume.pdf":
            y1 = Z.index("Updated_Ishank_Vasania_CSE_Resume.pdf")
            linked_in_res[y1] = "https://www.linkedin.com/in/ishank-vasania/"
    for n, i in enumerate(Z):
        # print("Rank\t" , n+1, ":\t" , i)
        # flask_return.append(str("Rank\t" , n+1, ":\t" , i))
        name = getfilepath(i)
        #name = name.split('.')[0]
        rank = n + 1
        res = ResultElement(rank, name)
        flask_return.append(res)
        # res.printresult()
        print(f"Rank {res.rank} :\t {res.filename}")
    return_flask_name.append(flask_return)
    return_flask_name.append(sorted_names)
    return_flask_name.append(linked_in_res)
    return return_flask_name
Example #52
def main():
	""" NLP Based App with Streamlit """

	# Title
	st.title("NLPiffy with Streamlit")
	st.subheader("Natural Language Processing On the Go..")
	st.markdown("""
    	#### Description
    	+ This is a Natural Language Processing(NLP) Based App useful for basic NLP task
    	Tokenization,NER,Sentiment,Summarization

    	""")

	# Tokenization
	if st.checkbox("Show Tokens and Lemma"):
		st.subheader("Tokenize Your Text")

		message = st.text_area("Enter Text","Type Here ..")
		if st.button("Analyze"):
			nlp_result = text_analyzer(message)
			st.json(nlp_result)

	# Entity Extraction
	if st.checkbox("Show Named Entities"):
		st.subheader("Analyze Your Text")

		message = st.text_area("Enter Text","Type Here ..")
		if st.button("Extract"):
			entity_result = entity_analyzer(message)
			st.json(entity_result)

	# Sentiment Analysis
	if st.checkbox("Show Sentiment Analysis"):
		st.subheader("Analyse Your Text")

		message = st.text_area("Enter Text","Type Here ..")
		if st.button("Analyze"):
			blob = TextBlob(message)
			result_sentiment = blob.sentiment
			st.success(result_sentiment)

	# Summarization
	if st.checkbox("Show Text Summarization"):
		st.subheader("Summarize Your Text")

		message = st.text_area("Enter Text","Type Here ..")
		summary_options = st.selectbox("Choose Summarizer",['sumy','gensim'])
		if st.button("Summarize"):
			if summary_options == 'sumy':
				st.text("Using Sumy Summarizer ..")
				summary_result = sumy_summarizer(message)
			elif summary_options == 'gensim':
				st.text("Using Gensim Summarizer ..")
				summary_result = summarize(message)
			else:
				st.warning("Using Default Summarizer")
				st.text("Using Gensim Summarizer ..")
				summary_result = summarize(message)

		
			st.success(summary_result)



	st.sidebar.subheader("About App")
	st.sidebar.text("NLPiffy App with Streamlit")
	st.sidebar.info("Cudos to the Streamlit Team")
	

	st.sidebar.subheader("By")
	st.sidebar.text("Jesse E.Agbe(JCharis)")
	st.sidebar.text("Jesus saves@JCharisTech")
Example #53
def findSummary(text, ratio):
    print('Summary:')
    return summarize(text, ratio)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim.summarization import summarize

# answers_input = "Encapsulation is a strategy used as part of abstraction. Encapsulation refers to the state of objects - objects encapsulate their state and hide it from the outside; outside users of the class interact with it through its methods, but cannot access the classes state directly. So the class abstracts away the implementation details related to its state. Abstraction is a more generic term, it can also be achieved by (amongst others) subclassing. For example, the interface List in the standard library is an abstraction for a sequence of items, indexed by their position, concrete examples of a List are an ArrayList or a LinkedList. Code that interacts with a List abstracts over the detail of which kind of a list it is using. Abstraction is often not possible without hiding underlying state by encapsulation - if a class exposes its internal state, it can't change its inner workings, and thus cannot be abstracted.\n"+\
# 	"Abstraction is the concept of describing something in simpler terms, i.e abstracting away the details, in order to focus on what is important (This is also seen in abstract art, for example, where the artist focuses on the building blocks of images, such as colour or shapes). The same idea translates to OOP by using an inheritance hierarchy, where more abstract concepts are at the top and more concrete ideas, at the bottom, build upon their abstractions. At its most abstract level there is no implementation details at all and perhaps very few commonalities, which are added as the abstraction decreases. As an example, at the top might be an interface with a single method, then the next level, provides several abstract classes, which may or may not fill in some of the details about the top level, but branches by adding their own abstract methods, then for each of these abstract classes are concrete classes providing implementations of all the remaining methods. Encapsulation is a technique. It may or may not be for aiding in abstraction, but it is certainly about information hiding and/or organisation. It demands data and functions be grouped in some way - of course good OOP practice demands that they should be grouped by abstraction. However, there are other uses which just aid in maintainability etc.\n"+\
# 	"Abstraction is the process of refining away all the unneeded/unimportant attributes of an object and keep only the characteristics best suitable for your domain. E.g. for a person: you decide to keep first and last name and SSN. Age, height, weight etc are ignored as irrelevant. Abstraction is where your design starts. Encapsulation is the next step where it recognizes operations suitable on the attributes you accepted to keep during the abstraction process. It is the association of the data with the operation that act upon them. I.e. data and methods are bundled together.\n"+\
# 	"Abstraction is used to show important and relevant data to user. best real world example In a mobile phone, you see their different types of functionalities as camera, mp3 player, calling function, recording function, multimedia etc. It is abstraction, because you are seeing only relevant information instead of their internal engineering. Encapsulation is hiding unnecessary data in a capsule or unit. Abstraction is showing essential feature of an object. Encapsulation is used to hide its member from outside class and interface.Using access modifiers provided in c#.like public,private,protected etc. \n"+\
# 	" Abstraction is a very general term, and abstraction in software is not limited to object-oriented languages. A dictionary definition: the act of considering something as a general quality or characteristic, apart from concrete realities, specific objects, or actual instances. Assembly language can be thought of as an abstraction of machine code -- assembly expresses the essential details and structure of the machine code, but frees you from having to think about the opcodes used, the layout of the code in memory, making jumps go to the right address, etc. Your operating system's API is an abstraction of the underlying machine. Your compiler provides a layer of abstraction which shields you from the details of assembly language. The TCP/IP stack built into your operating system abstracts away the details of transmitting bits over a network. If you go down all the way to the raw silicon, the people who designed your CPU did so using circuit diagrams written in terms of diodes and transistors, which are abstractions of how electrons travel through semiconductor crystals. In software, everything is an abstraction. We build programs which simulate or model some aspect of reality, but by necessity our models always abstract away some details of the real thing. We build layer on layer on layer of abstractions, because it is the only way we get anything done. (Imagine you were trying to make, say, a sudoku solver, and you had to design it using only semiconductor crystals. OK, I need a piece of N-type silicon here...)In comparison, encapsulation is a very specific and limited term. Some of the other answers to this question have already given good definitions for it.\n" +\
# 	"In Object oriented programming, we have something called classes. What are they for? They are to store some state and to store some methods to change that state i.e., they are encapsulating state and its methods. It(class) does not care about the visibility of its own or of its contents. If we choose to hide the state or some methods, it is information hiding. Now, take the scenario of an inheritance. We have a base class and a couple of derived (inherited) classes. So, what is the base class doing here? It is abstracting out some things from the derived classes.\n" + \
# 	"  Encapsulation - the process of hiding components of the class to prevent direct access from the outside. It is achieved by using private modifier to prevent direct access to some class members (data field or method) from other classes or objects meanwhile providing access to these private members by public members (Interface). That make the class members protected as human organs hidden/encapsulated under the skin or some shield.  Abstraction - A principle must be followed in writing OOP program that say you must include in the class only components that are interesting in the task of the program. For example: the object student has a lot of characters as a human: name, age, weight, hair color, eye color, etc. But, when you create a class in OOP to work with students you should include only those characters that really matter for student database: name, age, specialty, level, marks ... etc. in C++ you can create abstract class by using the modifier virtual with any methods in the class and that will make it unusable in direct but you can derive other classes from it and create implementation for its members with adding required members based on the task. \n"+\
# 	"Abstraction:  Technical Definition :- Abstraction is a concept to hide unnecessary details(complex or simple) and only show the essential features of the object. There is no implementaion here its just an concept  What it means practically:- When i say my company needs some medium/device so that employees can connect to customer . This is the purest form of abstaction(like interface in java) as that device/medium can be phone or internet or skype or in person or email etc. I am not going into nitty gritty of device/medium  Even when i say my company needs some medium/device so that employees can connect to customer through voice call. Then also i am talking abstract but at bit lower level as device/medium can be phone or skype or something else etc  Now when i say my company needs some phone so that employees can connect to customer through voice call. Then also i am talking abstract but at bit lower level as phone can be of any company like iphone or samsung or nokia etc  Encapsulation:- Its basically about hiding the state(information) of object with the help of modifiers like private,public,protected etc. we expose the state thru public methods only if require.  What it means practically:- Now when i say my company needs some iphone so that employees can connect to customer through voice call.Now i am talking about some concrete object(like iphone). Even though i am not going into nitty gritty of iphone here too but iphone has some state/concrecrete info/implementation associated with it where device/medium does not have. When i say concrete object, actually it means any object which has some(not complete like java abstract class) implementation/info associated with it.  So iphone actually used here encapsulation as strategy to hide its state/information and expose only the ones which it think should be exposed. So both abstraction and encapsulation hides some unnecessary details but abstraction at the concept level and encapsulation actually at implementation level"

answers_input = "Encapsulation is a strategy used as part of abstraction\n Encapsulation refers to the state of objects - objects encapsulate their state and hide it from the outside; outside users of the class interact with it through its methods, but cannot access the classes state directly\n So the class abstracts away the implementation details related to its state\n Abstraction is a more generic term, it can also be achieved by (amongst others) subclassing\n For example, the interface List in the standard library is an abstraction for a sequence of items, indexed by their position, concrete examples of a List are an ArrayList or a LinkedList\n Code that interacts with a List abstracts over the detail of which kind of a list it is using\n Abstraction is often not possible without hiding underlying state by encapsulation - if a class exposes its internal state, it can't change its inner workings, and thus cannot be abstracted\n Abstraction is the concept of describing something in simpler terms, i\ne abstracting away the details, in order to focus on what is important (This is also seen in abstract art, for example, where the artist focuses on the building blocks of images, such as colour or shapes)\n The same idea translates to OOP by using an inheritance hierarchy, where more abstract concepts are at the top and more concrete ideas, at the bottom, build upon their abstractions\n At its most abstract level there is no implementation details at all and perhaps very few commonalities, which are added as the abstraction decreases\n As an example, at the top might be an interface with a single method, then the next level, provides several abstract classes, which may or may not fill in some of the details about the top level, but branches by adding their own abstract methods, then for each of these abstract classes are concrete classes providing implementations of all the remaining methods\n Encapsulation is a technique\n It may or may not be for aiding in abstraction, but it is certainly about information hiding and/or organisation\n It demands data and functions be grouped in some way - of course good OOP practice demands that they should be grouped by abstraction\n However, there are other uses which just aid in maintainability etc\n Abstraction is the process of refining away all the unneeded/unimportant attributes of an object and keep only the characteristics best suitable for your domain\n E\ng\n for a person: you decide to keep first and last name and SSN\n Age, height, weight etc are ignored as irrelevant\n Abstraction is where your design starts\n Encapsulation is the next step where it recognizes operations suitable on the attributes you accepted to keep during the abstraction process\n It is the association of the data with the operation that act upon them\n I\ne\n data and methods are bundled together\n Abstraction is used to show important and relevant data to user\n best real world example In a mobile phone, you see their different types of functionalities as camera, mp3 player, calling function, recording function, multimedia etc\n It is abstraction, because you are seeing only relevant information instead of their internal engineering\n Encapsulation is hiding unnecessary data in a capsule or unit\n Abstraction is showing essential feature of an object\n Encapsulation is used to hide its member from outside class and interface\nUsing access modifiers provided in c#\nlike public,private,protected etc\n   Abstraction is a very general term, and abstraction in software is not 
limited to object-oriented languages\n A dictionary definition: the act of considering something as a general quality or characteristic, apart from concrete realities, specific objects, or actual instances\n Assembly language can be thought of as an abstraction of machine code -- assembly expresses the essential details and structure of the machine code, but frees you from having to think about the opcodes used, the layout of the code in memory, making jumps go to the right address, etc\n Your operating system's API is an abstraction of the underlying machine\n Your compiler provides a layer of abstraction which shields you from the details of assembly language\n The TCP/IP stack built into your operating system abstracts away the details of transmitting bits over a network\n If you go down all the way to the raw silicon, the people who designed your CPU did so using circuit diagrams written in terms of diodes and transistors, which are abstractions of how electrons travel through semiconductor crystals\n In software, everything is an abstraction\n We build programs which simulate or model some aspect of reality, but by necessity our models always abstract away some details of the real thing\n We build layer on layer on layer of abstractions, because it is the only way we get anything done\n (Imagine you were trying to make, say, a sudoku solver, and you had to design it using only semiconductor crystals\n OK, I need a piece of N-type silicon here\n\n\n)In comparison, encapsulation is a very specific and limited term\n Some of the other answers to this question have already given good definitions for it\n In Object oriented programming, we have something called classes\n What are they for? They are to store some state and to store some methods to change that state i\ne\n, they are encapsulating state and its methods\n It(class) does not care about the visibility of its own or of its contents\n If we choose to hide the state or some methods, it is information hiding\n Now, take the scenario of an inheritance\n We have a base class and a couple of derived (inherited) classes\n So, what is the base class doing here? 
It is abstracting out some things from the derived classes\n   Encapsulation - the process of hiding components of the class to prevent direct access from the outside\n It is achieved by using private modifier to prevent direct access to some class members (data field or method) from other classes or objects meanwhile providing access to these private members by public members (Interface)\n That make the class members protected as human organs hidden/encapsulated under the skin or some shield\n  Abstraction - A principle must be followed in writing OOP program that say you must include in the class only components that are interesting in the task of the program\n For example: the object student has a lot of characters as a human: name, age, weight, hair color, eye color, etc\n But, when you create a class in OOP to work with students you should include only those characters that really matter for student database: name, age, specialty, level, marks \n\n\n etc\n in C++ you can create abstract class by using the modifier virtual with any methods in the class and that will make it unusable in direct but you can derive other classes from it and create implementation for its members with adding required members based on the task\n  Abstraction:  Technical Definition :- Abstraction is a concept to hide unnecessary details(complex or simple) and only show the essential features of the object\n There is no implementaion here its just an concept  What it means practically:- When i say my company needs some medium/device so that employees can connect to customer \n This is the purest form of abstaction(like interface in java) as that device/medium can be phone or internet or skype or in person or email etc\n I am not going into nitty gritty of device/medium  Even when i say my company needs some medium/device so that employees can connect to customer through voice call\n Then also i am talking abstract but at bit lower level as device/medium can be phone or skype or something else etc  Now when i say my company needs some phone so that employees can connect to customer through voice call\n Then also i am talking abstract but at bit lower level as phone can be of any company like iphone or samsung or nokia etc  Encapsulation:- Its basically about hiding the state(information) of object with the help of modifiers like private,public,protected etc\n we expose the state thru public methods only if require\n  What it means practically:- Now when i say my company needs some iphone so that employees can connect to customer through voice call\nNow i am talking about some concrete object(like iphone)\n Even though i am not going into nitty gritty of iphone here too but iphone has some state/concrecrete info/implementation associated with it where device/medium does not have\n When i say concrete object, actually it means any object which has some(not complete like java abstract class) implementation/info associated with it\n  So iphone actually used here encapsulation as strategy to hide its state/information and expose only the ones which it think should be exposed\n So both abstraction and encapsulation hides some unnecessary details but abstraction at the concept level and encapsulation actually at implementation level"

print(summarize(answers_input, word_count=200))
                doc_id, text_body = next(doc_id_text_generator)
            except StopIteration:
                print('not enough docs found, breaking')
                break
            concat_txt = ' '.join([concat_txt, text_body[:args.single_doc_len]])
            breakout += 1

        print('used %i concatenated docs for this topic' % breakout)
        print('actual character length of concatenated docs: %i' % len(concat_txt))

        # make sure you have something
        if len(concat_txt) == 0:
            print('got nothing for this topic')
            continue

        # TODO: make these command-line args
        generate_keywords = True
        generate_sentences = True

        if generate_keywords:
            print('\ngenerating keywords\n------------------------------\n')
            summary = keywords(concat_txt, ratio=args.summary_ratio, split=True, lemmatize=True)
            print(', '.join(summary))
        if generate_sentences:
            print('\ngenerating sentences\n------------------------------\n')
            summary = summarize(concat_txt, split=True, ratio=args.summary_ratio)
            for sentence in summary:
                print(' * ' + sentence)

        # it's sentence or keyword depending on --sentence flag
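# A self-contained illustration of the two gensim calls used above
# (argument values are arbitrary; both functions expect a reasonably
# long input text):
from gensim.summarization import keywords, summarize

def keywords_and_sentences(text, ratio=0.2):
    # keywords() returns ranked key terms; summarize() returns the
    # top-ranked sentences. split=True yields lists instead of strings.
    kws = keywords(text, ratio=ratio, split=True, lemmatize=True)
    sents = summarize(text, ratio=ratio, split=True)
    return kws, sents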
Exemple #56
0
def saving_npy(direction, methods=['GET', 'POST']):
    filename = getsuid(direction)
    filename = filename.split("/")[-1]
    datasub = candidates[0:0]
    for j in range(candidates.shape[0]):
        if (candidates.seriesuid[j] == filename):
            datasub = datasub.append(candidates.loc[j])
    datasub['file'] = "D:\\subset6\\" + datasub.seriesuid + ".mhd"
    datapos = datasub[datasub['class'] == 1]
    dataneg = datasub[datasub['class'] == 0]
    dataneg = dataneg.sample(n=datapos.shape[0], random_state=42)
    path = ''
    for j in range(datapos.shape[0]):
        ineed = getdata(datapos.iloc[j])
        path = ("D:\\npy\\uploads\\save_npy\\1\\" + datapos.iloc[j][0] +
                str(datapos.iloc[j][1]) + str(datapos.iloc[j][2]) +
                str(datapos.iloc[j][3]) + ".npy")
        np.save(path, ineed)
    for j in range(dataneg.shape[0]):
        ineed = getdata(dataneg.iloc[j])
        path = ("D:\\npy\\uploads\\save_npy\\0\\" + dataneg.iloc[j][0] +
                str(dataneg.iloc[j][1]) + str(dataneg.iloc[j][2]) +
                str(dataneg.iloc[j][3]) + ".npy")
        np.save(path, ineed)
    print("path:  ", path)
    a = np.load(path)
    plt.imshow(a, cmap=plt.cm.gray)
    imsave(path.replace(".npy", ".png"), a)
    img = cv2.imread(path.replace(".npy", ".png"))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    ret, thresh = cv2.threshold(gray, 0, 255,
                                cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # noise removal
    kernel = np.ones((3, 3), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
    # sure background area
    sure_bg = cv2.dilate(opening, kernel, iterations=1)
    plt.imshow(sure_bg)
    total = 0
    for i in range(32):
        if Counter(sure_bg[i])[0] != 0:
            total += Counter(sure_bg[i])[0]
    _diameter_mm = total * 0.264583 * 0.65
    _img = []
    _img.append(np.load(path))
    img = np.array(_img)
    img = img.astype('float32')
    print(img.shape)
    img = img.reshape(img.shape[0], 32, 32, 1)
    print(img.shape)
    with graph.as_default():
        y = model.predict_classes(img)
    print(y)
    if y[0] == 0 and _diameter_mm <= 0:
        prediction = "Normal"
        treatment = "Stay healthy :) "
    elif y[0] == 0 and _diameter_mm > 0 and _diameter_mm < 10:
        prediction = "Stage 1"
        treatment = summarize(state_2, word_count=100).replace("\n", "")
    else:
        prediction = "stage 2 "
        treatment = summarize(state_1, word_count=100).replace("\n", "")
    return render_template("elements.html",
                           calculate_diameter=round(_diameter_mm, 2),
                           prediction=prediction,
                           treatment=treatment)
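# saving_npy() above leans on several module-level globals (candidates,
# model, graph, state_1, state_2) and on getsuid()/getdata() helpers that
# are not shown in this fragment. A sketch of the assumed setup; file names
# and treatment texts are hypothetical placeholders:
import pandas as pd
import tensorflow as tf
from keras.models import load_model

candidates = pd.read_csv("D:\\subset6\\candidates.csv")  # LUNA16-style candidate list
model = load_model("lung_nodule_cnn.h5")  # trained 32x32 CNN classifier (assumed)
graph = tf.get_default_graph()  # TF1 pattern: reuse one graph across Flask requests
state_1 = "..."  # long stage-2 treatment text, summarized to 100 words above
state_2 = "..."  # long stage-1 treatment text, summarized to 100 words above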
		st = st + i + '\n'
	return st

try:
	path='/home/harikrishnan-midhun/Desktop/MainPro/AspectLevelSumm/display'

	with open('/home/harikrishnan-midhun/Desktop/MainPro/hum_summ', 'r') as myfile:
		hls = myfile.read().splitlines()
		hum_summ = ''.join(i for i in hls if i != " ")
	with open(path, 'r') as myfile:
		data = myfile.read()  # .replace('\n', '.')
		data = antiDup(data)
		sum1 = summarize(data)
	print("\nSummary Using Text Rank\n______________________________\n")
	print(sum1)
	dic = begin(path)
	s = dictostr(dic)
	s = antiDup(s)
	print("\n\nSummary Using Hybrid TFIDF\n___________________________\n")
	print(s)
	# print("\nHuman Summary\n______________________________\n")
	# print(hum_summ)

	rouge = Rouge()
	#scorestf = rouge.get_scores(s, hum_summ)
	#print("\nRouge Score of TFIDF\n____________________________\n")
	#print(scorestf)
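# For reference, the commented-out scoring above would look roughly like
# this with the `rouge` package; get_scores(hyps, refs) expects the system
# summary first (a sketch, not the original code):
from rouge import Rouge

def score_summary(system_summary, reference_summary):
	rouge = Rouge()
	# Returns a list with one dict of rouge-1/rouge-2/rouge-l p/r/f scores.
	return rouge.get_scores(system_summary, reference_summary)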
def text_summarization_gensim(text, summary_ratio=0.5):
    summary = summarize(text, split=True, ratio=summary_ratio)
    for sentence in summary:
        print(sentence)
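# Illustrative usage (the file name is a placeholder; any multi-sentence
# plain-text document will do):
if __name__ == '__main__':
    with open('sample_document.txt') as f:
        text_summarization_gensim(f.read(), summary_ratio=0.3)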