Example no. 1
def aggregate_file(interval, file_name, start=None):
    """ Aggregate the data within the windows of time

        interval:       time in seconds to aggregate data
        file_name:      which file to record
        start:          start time to record data, if none given then it starts
                        from the beginning.

        returns: array of the aggregated data in each interval
    """
    if start is None:
        start = get_start_time_for(file_name)

    start = datetime.strptime(start, TIME_FORMAT)
    summaries = [Summarizer() for _ in range(10)]
    with open(file_name, 'r+') as data:
        headers = data.readline().strip().lower().split(',')
        for line in data:
            args = line.strip().split(',')
            time = datetime.strptime(args[0], TIME_FORMAT)
            window = int((time - start).total_seconds() / interval)
            if window < 0:
                continue
            if window >= len(summaries):
                # Grow the list just enough so that index `window` exists.
                summaries.extend(Summarizer() for _ in range(window + 1 - len(summaries)))
            item = dict(zip(headers, args))
            summaries[window].add(item)

    return [s for s in summaries if s.used]
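A minimal usage sketch for aggregate_file; the 300-second interval and the file name are illustrative assumptions, and TIME_FORMAT, get_start_time_for and Summarizer are expected to come from the surrounding module.

# Hypothetical call: aggregate a capture file into 5-minute windows.
windows = aggregate_file(300, "capture.featureset.csv")
print("non-empty windows:", len(windows))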
Example no. 2
def summarization():
    try:
        json_data = request.get_json(force=True)

        text = json_data.get('text')
        method = json_data.get('method', 'T5')
        pretrained = json_data.get('pretrained', 't5-large')

        # initialize Summarizer
        s = Summarizer(method=method, pretrained=pretrained)

        pred = s.summarize(text)
        if isinstance(pred, dict):
            summary = pred['summary']
            data = dict()
            data['text'] = text
            data['summary'] = summary
            data['summary_len'] = Utility.get_doc_length(summary)
            data['text_len'] = Utility.get_doc_length(text)
            return json.dumps(data)
        else:
            return json.dumps(pred)

    except Exception as e:
        return {"Error": str(e)}
Example no. 3
def Summ(spacy_model, body, ratio, coreference, greedyness, min_length,
         max_length, bert_model):

    # Loading Spanish BERT custom model and tokenizer (output_hidden_states must be set to True).
    custom_config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path=bert_model)
    custom_config.output_hidden_states = True
    custom_tokenizer = AutoTokenizer.from_pretrained(bert_model)
    custom_model = AutoModel.from_pretrained(bert_model, config=custom_config)

    #Summarization
    if coreference:
        # With Coreference
        handler = CoreferenceHandler(spacy_model=spacy_model,
                                     greedyness=greedyness)
        model = Summarizer(custom_model=custom_model,
                           custom_tokenizer=custom_tokenizer,
                           sentence_handler=handler)
        result = model(body, min_length=min_length)

    else:
        # Without Coreference
        model = Summarizer(custom_model=custom_model,
                           custom_tokenizer=custom_tokenizer)
        result = model(body,
                       ratio=ratio,
                       min_length=min_length,
                       max_length=max_length)

    summarized_text = ''.join(result)
    return summarized_text
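A sketch of how Summ might be invoked; the spaCy pipeline, the Spanish BERT checkpoint, and the `texto` input are illustrative assumptions, not values from the original project.

resumen = Summ(spacy_model="es_core_news_md",
               body=texto,                     # `texto` is an assumed input string
               ratio=0.3,
               coreference=False,
               greedyness=0.45,
               min_length=40,
               max_length=600,
               bert_model="dccuchile/bert-base-spanish-wwm-cased")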
Example no. 4
def test_run():
    if path.exists("keys"):
        api = Summarizer("Manchester", "manchester", "keys")
        assert api.keyToVideo() == 0
    else:
        with open("data.json", "r") as f:
            return json.load(f)
Example no. 5
def get_weight_sentence(document):
    text = document.text
    theme = document.theme
    lang = document.language
    nlp = nlp_it
    if lang == 'italian':
        model = Summarizer(sentence_handler=SentenceHandler(language=Italian))
        args = get_words(text, nlp)
    else:
        model = Summarizer(sentence_handler=SentenceHandler(language=Russian))
        args = get_words_for_ru(text)
    document.args = str(args)
    document.save()
    sentences = sent_tokenize(text)
    if lang == 'italian':
        sorted_sentences = get_sorted_sentence(sentences, nlp, text, args,
                                               theme, lang)
    else:
        sorted_sentences = get_sorted_sentence_for_ru(sentences, text, args,
                                                      theme, lang)
    note = generate(sentences, sorted_sentences)
    note_with_ml = model(text)
    note_item = Note(document_id=document,
                     text=note,
                     text_for_algo="",
                     text_for_ml=note_with_ml)
    note_item.save()
Example no. 6
def post():
    try:
        url = request.args.get('url')
        summarizer = Summarizer(url)
        return render_template('post.html',
                               text=Markup(summarizer.summarize()),
                               img=summarizer.article.top_image,
                               title=summarizer.title), 200
    except Exception:
        raise InvalidUsage('Oops, something went wrong. Please try again',
                           status_code=404)
Example no. 7
def process_text(text):
    """
    Tries to get the summary, intro, keywords
    Return is_valid indicating if text was okay
    :param text:
    :param title:
    :return:
    """

    is_valid = True

    keywords = []
    summarizer = Summarizer(text)
    try:
        summary = summarizer.summarize()

        keywords = summarizer.keywords(number_keywords=number_keywords)
        phrases = summarizer.key_noun_phrases(
            number_noun_phrases=number_noun_phrase)
        phrases = [phrase.replace(" ", "_") for phrase in phrases]

        keywords = keywords + phrases

    except SummaryException:
        summary = [""]
        is_valid = False

    summary = " ".join(summary)

    return is_valid, summary, keywords
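A hedged usage sketch; `article_text` is an assumed input string, and number_keywords / number_noun_phrase are assumed to be module-level settings in the original project.

is_valid, summary, keywords = process_text(article_text)
if is_valid:
    print(summary)
    print(keywords)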
Example no. 8
def inference(_id, corrector, segmenter):
    res = getBodyTopic(_id)
    body = res['text']
    topic = res['topic']
    username = res['username']
    length = res['length']
    body = preProcessTranscript(body, corrector)
    entities = entityRecog(_id)
    # topic = preProcessTranscript(topic)
    meeting_sentiment = sentiment(_id)
    # body = correct_sentence__test(body, corrector, segmenter)
    model = Summarizer()
    data = model.get_summary(body, topic)
    # Keep at most `length` sentences from the model's summary.
    result = " ".join(item['sentence'] for item in data[:length])
    res = saveSummary(_id, result, _id,
                      username, length, meeting_sentiment, entities, topic)

    return {"_id": str(res.inserted_id)}
Example no. 9
def summarize():
    url = request.args.get('url')
    if not url:
        raise InvalidUsage('Oops, something went wrong. Please try again',
                           status_code=404)
    summarizer = Summarizer(url)
    summary = summarizer.summarize()
    response = {'title': summarizer.title,
                'summary': summary,
                'img': summarizer.article.top_image}
    return jsonify(response)
Example no. 10
class TextTeaser(object):
  def __init__(self):
    self.summarizer = Summarizer()

  def summarize(self, title, text, category = "Undefined", source = "Undefined", count = 5):
    result = self.summarizer.summarize(text, title, source, category)
    result = self.summarizer.sortSentences(result[:count])
    result = [res['sentence'] for res in result]

    return result
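A short usage sketch for the wrapper above; `article_title` and `article_text` are assumed inputs.

teaser = TextTeaser()
top_sentences = teaser.summarize(article_title, article_text, count=3)
for sentence in top_sentences:
    print(sentence)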
Example no. 11
    def test_generate_summary(self):
        """Various test cases for summarizer.generate_summary."""
        summarizer_simple = Summarizer(self.simple_dataframe, self.config)
        output_df_simple = summarizer_simple.generate_summary()
        self.assertEqual(len(output_df_simple), 3)
        self.assertIn(1, output_df_simple['Size'].values)
        self.assertIn('test', output_df_simple['Text'].values)
        self.assertIn('test2', output_df_simple['Text'].values)
        self.assertIn('test3', output_df_simple['Text'].values)
        self.assertIn('', output_df_simple['ClassLines'].values)

        summarizer_stack_lines = Summarizer(self.stack_lines_dataframe,
                                            self.config)
        output_df_stack_lines = summarizer_stack_lines.generate_summary()
        self.assertIn('some.class.java',
                      output_df_stack_lines['ClassLines'].values)
        self.assertIn('some.class2.java',
                      output_df_stack_lines['ClassLines'].values)
        self.assertIn('some.class3.java',
                      output_df_stack_lines['ClassLines'].values)
        self.assertIn('', output_df_stack_lines['Text'].values)

        summarizer_multi_cluster = Summarizer(self.multi_cluster_dataframe,
                                              self.config)
        output_df_multi_cluster = summarizer_multi_cluster.generate_summary()
        self.assertEqual(len(output_df_multi_cluster), 2)
        self.assertIn(2, output_df_multi_cluster['Size'].values)
        self.assertIn(1, output_df_multi_cluster['Size'].values)
Example no. 12
def pre_processing(list_lineInDoc,
                   caseIdx,
                   output_path="./output/train_data_preprocessed.txt",
                   separator=DEFAULT_SEPARATOR,
                   write_valid=True,
                   logPrint=False):
    '''
    To Jeff:
    Implement here.
    '''

    if logPrint:
        print("")
        print("\tPre-processing law case [%d]" % caseIdx)

    if caseIdx == 0 and logPrint:
        print("")
        print("\t\t1.Remove \\n, \\t")
        print("\t\t2.Remove Punctuations.")
        print("\t\t3.Split Words.")
        print("\t\t4.Remove StopWords.")
        print("\t\t5.Lemmatize Words.")

    ## 1. Remove '\n','\t', etc and make it readable.
    for idx in range(len(list_lineInDoc)):
        list_lineInDoc[idx] = " ".join(list_lineInDoc[idx].replace(
            "\n", " ").split())

    ## 2. Analyse catchphrase and sentences.
    catchphrase = list_lineInDoc[0]
    sentences = list_lineInDoc[1:]

    summarizer = Summarizer()
    result = summarizer.summarize(sentences, catchphrase)

    ## 3. Save result.
    if write_valid:
        with open(output_path, 'a') as f:
            f.write(separator + str(caseIdx))
            f.write('\n')

            for item in result:
                word_stat_line = item[0]
                for stat in word_stat_line:
                    f.write(str(stat['word']))
                    f.write(":")
                    f.write(str(stat['count']))
                    f.write(" ")

                f.write('\n')

    return result
Example no. 13
class TextTeaser(object):
    def __init__(self):
        self.summarizer = Summarizer()

    def summarize(self, text, count):
        result = self.summarizer.summarize(text)
        #print(result)
        result = self.summarizer.sortSentences(result[:count])
        result = [res['sentence'] for res in result]

        return result
Example no. 14
class nlps(object):

    def __init__(self):
        self.summarizer = Summarizer()

    def summarize(self, title, text, category="Undefined", source="Undefined", count=5):
        result = self.summarizer.summarize(text, title, source, category)
        result = self.summarizer.sortSentences(result[:count])
        result = [res['sentence'] for res in result]

        return result
Example no. 15
def worker():
    i = 0
    while True:
        item = q.get()
        if item is None:
            print("Break ! cuz item is None")
            break
        api = Summarizer(item[0], item[1], "keys")
        api.keyToVideo()
        i += 1
        print("-----Task{0}----".format(i))
        q.task_done()
Example no. 16
def summary_of_detection(filename,
                         model,
                         use_bots=False,
                         use_attack=False,
                         sample=False,
                         use_ahead=False,
                         steps_ahead=1,
                         trees=50,
                         norm_and_standardize=False):
    """General call to train and test any model under any features.
    Args:
        filename: Aggregated file to get features and labels from
        model: Abbreviated model name to use (rf: Random Forest, dl: Deep Learning, dt: Decision Trees)
        use_bots: Use bot type as the label (Only general-*.featureset.csv files have this information)
        use_attack: Use attack type as the label (only all-*.featureset.csv files have this information)
        sample: Sample the featureset so that there is an equal number of attack and normal labels
        use_ahead: Works only on files aggregated with `--use_separator` flag turned on.
        steps_ahead: Only used if use_ahead is true. The number of time windows ahead to skip to get the new label
                     for the current feature set.
        trees: The number of trees to use in Random Forest.
        norm_and_standardize: Whether to normalize and standardize the numerical features.
    Returns:
        The accuracy, precision, recall, and f1_score of the model.
    """
    if use_ahead:
        xtrain, xtest, ytrain, ytest = get_ahead_feature_labels(
            filename,
            Summarizer().features, steps_ahead)
    else:
        xtrain, xtest, ytrain, ytest = get_features_labels_from(
            filename,
            Summarizer().features,
            use_bots,
            use_attack,
            sample=sample,
            norm_and_standardize=norm_and_standardize)
    if model == 'rf':
        clf = rf_train(xtrain, ytrain, use_attack, use_ahead, trees=trees)
    elif model == 'dt':
        clf = dt_train(xtrain, ytrain)
    elif model == 'dl':
        if use_bots:
            # Multi-label needs to be converted into a sparse array for Tensorflow.
            ytrain = to_tf_labels(ytrain)
            ytest = to_tf_labels(ytest)
        clf = dl_train(xtrain, ytrain, use_bots)
        return dl_test(clf, xtrain, ytrain, use_bots)

    if use_attack:
        return [
            x['micro'] for x in test(clf, xtest, ytest, use_bots, use_attack)
        ]
    return test(clf, xtest, ytest, use_bots, use_ahead=use_ahead)
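A hedged usage sketch; the featureset file name below is a hypothetical placeholder following the naming pattern mentioned in the docstring.

metrics = summary_of_detection("general-300s.featureset.csv", model="rf",
                               sample=True, trees=100)
print(metrics)  # accuracy, precision, recall and f1, per the docstring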
Example no. 17
class ReviewSummaryAction(Action):
    summarizer = Summarizer()

    @staticmethod
    def review_summarization(reviews):
        message = "\n"
        for review in reviews:
            result = ReviewSummaryAction.summarizer(review["text"])
            full = ''.join(result)
            if full != '':
                message += "\n-----\n" + full + " - " + review[
                    "author_name"] + ", " + review["relative_time_description"]
        return message

    def name(self) -> Text:
        return "review_summary_action"

    def run(self, dispatcher: CollectingDispatcher, tracker: Tracker,
            domain: Dict[Text, Any]) -> List:
        facility_details = tracker.get_slot("facility_details")
        if facility_details is None:
            error_message = "You have to choose a facility first"
            dispatcher.utter_message(error_message)
            return []

        if "reviews" not in facility_details:
            review_message = ATMOSPHERE_DICT.get("review_messages")[-1]
            dispatcher.utter_message(review_message)
        else:
            summary_message = ReviewSummaryAction.review_summarization(
                facility_details["reviews"])
            print(summary_message)
            dispatcher.utter_message(summary_message)
Example no. 18
def main():
    #ask user for how many summaries they want
    print(
        "Welcome to CFS's Summary generator. This pulls articles from NLTK's Reuters corpus and summarizes them\n"
    )
    try:
        numarts = int(
            input("How many articles do you wish us to summarize?: "))
    except Exception:
        print("Uh oh, looks like that wasn't a number... \n Please try again")
        return

    #prevents users from asking for negative numbers or more than 5 at a time.
    if numarts < 1:
        print(
            "I understand why you wouldn't want to read summaries of financial articles, but come on, at least one? =[ "
        )
        return
    if numarts > 5:
        print(
            "Okay, do you really want to read more than five bland reuters articles summaries at a time?"
        )
        print("Please try a more sensible number")
        return

    print()
    # create our summarizer
    cfs = Summarizer()
    getSummaries(cfs, numarts)
Example no. 19
def summarize(body):

    model = Summarizer()
    result = model(body, min_length=20)
    full = ''.join(result)

    return full
Example no. 20
def load_summarizer():
    try:
        summ = pickle.load(open('./models/summarizer.pkl', 'rb'))
    except:
        summ = Summarizer()
        pickle.dump(summ, open('./models/summarizer.pkl', 'wb'))
    return summ
Example no. 21
def text_summarization(stt_file_path):
    """ this method take the speech to text output file path
        ==> apply bert extractive text summarization
        ==> return the text summary file path
    """
    try:
        # Read data from the input file (sumtext.txt)
        with open(stt_file_path, "r") as inputfile:
            datainput = inputfile.read()

        print("Summarization processing ......")

        model = Summarizer()
        result = model(datainput, min_length=10)
        output_summary = ''.join(result)

        # Write data to the output file (bert-extractive-summarizer.txt)
        filename = 'summary-bert' + str(
            uuid.uuid4()) + '.txt'  # generate a unique file name
        out_path = 'upload/text_summarization/' + filename
        path = os.path.join(MEDIA_ROOT, out_path)

        file = open(path, "a")
        file.write(output_summary)
        file.close()

        print('sucess summary >>>>>>')

        return out_path
    # pylint: disable=bare-except
    except:
        print("An exception occurred in text summarization")
Example no. 22
def load_summarizer():
    try:
        model = pickle.load(open('./models/summarizer.pkl', 'rb'))
    except:
        model = Summarizer()
        pickle.dump(model, open('./models/summarizer.pkl', 'wb'))
    return model
Example no. 23
 def load(self):
     try:
         logging.info('Loading BERTExtractiveSummarizer...')
         BERTExtractiveSummarizer.model = Summarizer()
     except Exception as e:
         raise ModelFailedLoadException(
             f'Failed to load BERTExtractiveSummarizer with {str(e)}')
Example no. 24
 def get_summarizer_model(self):
     if self.summarizer_model is None:
         nlp = spacy.load('en_core_web_md',
                          disable=['ner', 'parser', 'tagger'])
         nlp.add_pipe(nlp.create_pipe('sentencizer'))
         self.summarizer_model = Summarizer()
     return self.summarizer_model
Example no. 25
def bert_summarizer(data):
    summarizer_bert = Summarizer()
    summary_bert = summarizer_bert(data, min_length=30, max_length=140)
    print('Bert for Text - Summarization')
    summary = ''.join(summary_bert)
    rouge_scores = rouge.get_scores(summary, data)
    return summary, rouge_scores
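A minimal sketch of calling the helper above; `article_text` is an assumed input, and `rouge` is assumed to be a module-level scorer (e.g. from the rouge package).

summary, scores = bert_summarizer(article_text)
print(summary)
print(scores)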
Example no. 26
def test_multi_hidden_concat(summarizer_multi_hidden: Summarizer, passage):
    summarizer_multi_hidden.hidden_concat = True
    res = summarizer_multi_hidden(passage,
                                  num_sentences=5,
                                  min_length=40,
                                  max_length=500)
    assert len(res) > 10
Example no. 27
def texte_summarizer(input_texte, ratio=0.5):
    """
    INPUT:
        input_texte : str - il s'agit du texte que l'on souhaite résumer
        ratio : float (compris entre 0 et 1) - pourcentage de réduction du texte, si ratio = 0.5 on va garder 50% des phrases du texte d'origine

    OUTPUT:
        dico : dict - {'summary_texte' : str result} avec result le texte résumé

    BODY:
        On charge depuis le dossier static/model le modele de NLP à utiliser puis on l'applique au texte à résumer
    """
    try:
        custom_config = AutoConfig.from_pretrained(
            "static/model/camembert-base")
        custom_config.output_hidden_states = True
        custom_tokenizer = AutoTokenizer.from_pretrained(
            "static/model/camembert-base")
        custom_model = AutoModel.from_pretrained("static/model/camembert-base",
                                                 config=custom_config)
        model = Summarizer(
            custom_model=custom_model,
            custom_tokenizer=custom_tokenizer,
        )
        result = model(input_texte, ratio=ratio)
        dico = {'summary_text': result}
    except:
        import_bert()
        dico = texte_summarizer(input_texte, ratio)
    return dico
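A usage sketch under the assumption that the camembert-base files are present under static/model; `texte_long` is an assumed input string.

dico = texte_summarizer(texte_long, ratio=0.3)
print(dico['summary_text'])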
Example no. 28
 def load(model: str, tokenizer: BertTokenizer, device=None) -> Summarizer:
     config = BertConfig.from_pretrained(model)
     config.output_hidden_states = True
     bert_model = BertModel.from_pretrained(model, config=config)
     if device is not None:
         bert_model = bert_model.to(device)
     return Summarizer(custom_model=bert_model, custom_tokenizer=tokenizer)
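A sketch of how this loader might be used; the checkpoint name, device, ratio, and `document_text` are illustrative assumptions, and BertTokenizer comes from Hugging Face transformers.

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
summarizer = load("bert-base-uncased", tokenizer, device="cuda")
print(summarizer(document_text, ratio=0.2))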
Example no. 29
def main():
    # Parse the JSON arguments
    try:
        config_args = parse_args()
    except:
        print("Add a config file using \'--config file_name.json\'")
        exit(1)

    # Create the experiment directories
    _, config_args.summary_dir, config_args.checkpoint_dir = create_experiment_dirs(
        config_args.experiment_dir)

    # Reset the default Tensorflow graph
    tf.reset_default_graph()

    # Tensorflow specific configuration
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Data loading
    # The batch size is equal to 1 when testing to simulate the real experiment.
    data_batch_size = config_args.batch_size if config_args.train_or_test == "train" else 1
    data = DataLoader(data_batch_size, config_args.shuffle)
    print("Loading Data...")
    config_args.img_height, config_args.img_width, config_args.num_channels, \
    config_args.train_data_size, config_args.test_data_size = data.load_data()
    print("Data loaded\n\n")

    # Model creation
    print("Building the model...")
    model = ShuffleNet(config_args)
    print("Model is built successfully\n\n")

    # Summarizer creation
    summarizer = Summarizer(sess, config_args.summary_dir)
    # Train class
    trainer = Train(sess, model, data, summarizer)

    if config_args.train_or_test == 'train':
        try:
            print("FLOPs for batch size = " + str(config_args.batch_size) +
                  "\n")
            calculate_flops()
            print("Training...")
            trainer.train()
            print("Training Finished\n\n")
        except KeyboardInterrupt:
            trainer.save_model()

    elif config_args.train_or_test == 'test':
        print("FLOPs for single inference \n")
        calculate_flops()
        # This can be 'val' or 'test' or even 'train' according to the needs.
        print("Testing...")
        trainer.test('val')
        print("Testing Finished\n\n")

    else:
        raise ValueError("Train or Test options only are allowed")
Example no. 30
def extSumm(full):
    orgi = full
    model = Summarizer(
        model='distilbert-base-uncased')  # can adjust parameters
    #summa = model(orgi, ratio=0.3, min_length=60) # can adjust parameters
    summa = model(orgi, ratio=0.2, min_length=60)
    return summa
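An illustrative one-line call; `full_text` is an assumed input string.

summary = extSumm(full_text)
print(summary)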
Example no. 31
def run():
    parser = argparse.ArgumentParser(
        description='Process and summarize lectures')
    parser.add_argument('-path',
                        dest='path',
                        default=None,
                        help='File path of lecture')
    parser.add_argument('-lang',
                        dest='lang',
                        default='en',
                        help='Language supported : en,fr')
    parser.add_argument('-model',
                        dest='model',
                        default='bert-large-uncased',
                        help='')
    parser.add_argument('-num-sentences',
                        dest='num_sentences',
                        default=-1,
                        help='Will return X sentences')
    parser.add_argument(
        '-ratio',
        dest='ratio',
        default=-1,
        help=
        'Will return a ratio of sentences from the text length (0.2 is a good value)'
    )
    parser.add_argument('-hidden',
                        dest='hidden',
                        default=-2,
                        help='Which hidden layer to use from Bert')
    parser.add_argument('-reduce-option',
                        dest='reduce_option',
                        default='mean',
                        help='How to reduce the hidden layer from bert')
    parser.add_argument('-greedyness',
                        dest='greedyness',
                        help='Greedyness of the NeuralCoref model',
                        default=0.45)
    args = parser.parse_args()

    if not args.path:
        raise RuntimeError("Must supply text path.")

    with open(args.path) as d:
        text_data = d.read()

    spacy_model = sentence_handlers[args.lang]
    model = Summarizer(
        model=args.model,
        hidden=args.hidden,
        reduce_option=args.reduce_option,
        sentence_handler=CoreferenceHandler(spacy_model=spacy_model))

    if int(args.num_sentences) >= 0:
        result = model(text_data, num_sentences=int(args.num_sentences))
    elif float(args.ratio) >= 0.0:
        result = model(text_data, ratio=float(args.ratio))
    else:
        result = model(text_data)
    print(result)
Example no. 32
 def Summary(self):
     summ = dict.fromkeys(self.clust[0].unique())
     # Will store the name of Cluster based on article name
     subtitles = []
     # Will store the url of main article summarizing the cluster
     urls = []
     model = Summarizer()
     # Create the summary by aggregating all the bodies of the articles
     for c in summ.keys():
         df2 = self.clust[self.clust[0] == c]
         subtitles = subtitles + [self.results.loc[df2.index[0]]['title']]
         urls = urls + [self.results.loc[df2.index[0]]['url']]
         full_text = ' '.join(self.results.loc[df2.index]['body'].unique())
         full_text = full_text.replace('\n', ' ')
         length = len(full_text)
         if length < 500:
             target = 200
             ratiol = min(1, target / length)
         else:
             target = min(self.max_len + 150 * (len(df2)),
                          length / len(df2) * (0.15 + 0.05 * len(df2)))
             ratiol = min(1, target / length)
         summ[c] = model(full_text, min_length=60, ratio=ratiol)
     self.summary = summ
     self.subtitles = subtitles
     self.urls = urls
Example no. 33
async def process(ctx):
    file_path = Formatter(ctx.channel.id).getInputFilePath(
        int(open("counter.txt", 'r').readline()))
    input_text = ''.join(open(file_path, 'r').readlines())
    output = await Summarizer().summarize(ctx, input_text)
    Summarizer().save_summaries(output,
                                int(open("counter.txt", 'r').readline()))
Example no. 34
# -*- coding: utf-8 -*-
from goose import Goose
from goose.text import StopWordsChinese

from summarizer import Summarizer

import json

g = Goose({'stopwords_class': StopWordsChinese})
url = 'http://sports.sina.com.cn/nba/2013-10-29/00086855748.shtml'
article = g.extract(url=url)
title = article.title
print(title)
text = article.cleaned_text
print(text)
summary = Summarizer()
summary_list = summary.summarize(title, text)

print('summary is below:')

for sentence in summary_list:
    print(sentence)

summary_json = json.dumps(summary_list)
print(summary_json)
Example no. 35
def getInput():
    with open('input.txt') as file:
        content = file.readlines()

    # remove unnecessary \n
    content = [c.replace('\n', '') for c in content if c != '\n']

    title = content[0]
    text = content[-(len(content) - 1):]

    return {'title': title, 'text': ' '.join(text)}

# #####################

input = getInput()
input['text'] = input['text'].encode("ascii", "ignore").decode("ascii")  # strip non-ASCII characters

input['text'] = " ".join(input['text'].replace("\n", " ").split())

summarizer = Summarizer()
result = summarizer.summarize(input['text'], input['title'], 'Undefined', 'Undefined')
result = summarizer.sortScore(result)
result = summarizer.sortSentences(result[:30])

print('Summary:')

for r in result:
    print(r['sentence'])
    # print(r['totalScore'])
    # print(r['order'])
Example no. 36
 def __init__(self):
     self.summarizer = Summarizer()
Example no. 37
if __name__ == '__main__':
    mongo_client = MongoClient()
    db = mongo_client.g_project_data
    coll = db.test_data

    summary_list, article_list = get_summaries_and_articles(coll)

    mongo_client.close()

    idf = unpickle('idf')
    vocab = unpickle('vocab')

    count = CountVectorizer(vocabulary=vocab, stop_words='english')

    summarizer_multi = Summarizer(vocab=vocab, idf=idf, vectorizer=count, scoring='multi_Tfidf')
    summarizer_single = Summarizer(vocab=vocab, idf=idf, vectorizer=count, scoring='single_Tfidf')
    summarizer_sig = Summarizer(vocab=vocab, idf=idf, vectorizer=count, scoring='significance')
    summarizer_sim = Summarizer(vocab=vocab, idf=idf, vectorizer=count, scoring='similarity')
    summarizer_rand = Summarizer(vocab=vocab, idf=idf, vectorizer=count, scoring='random')

    multi_r2 = []
    multi_reduction2 = []
    single_r2 = []
    single_reduction2 = []
    sig_r2 = []
    sig_reduction2 = []
    sim_r2 = []
    sim_reduction2 = []
    rand_r2 = []
    rand_reduction2 = []